library(DataExplorer)
library(plumber)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(corrplot)
## corrplot 0.92 loaded
library(MatchIt)
library(caTools)
library(glm2)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(cluster)
# Load the cleaned IBM attrition dataset
ab_dataset_cap <- read_csv("ibmdatacleaned.csv")
## Rows: 23156 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): Department, EducationField, JobRole, Attrition, BusinessTravel, Ed...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeNumber, Envir...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Conversion of Numeric Variables
numeric_vars <- c("Age", "DailyRate", "DistanceFromHome", "Education", "EmployeeNumber",
"EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
"JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
"PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
"StandardHours", "StockOptionLevel", "TotalWorkingYears",
"TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany",
"YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager", "Index")
# Ensure that the variables actually exist in the dataframe before conversion
existing_numeric_vars <- numeric_vars[numeric_vars %in% names(ab_dataset_cap)]
# Convert existing variables to numeric
ab_dataset_cap[existing_numeric_vars] <- lapply(ab_dataset_cap[existing_numeric_vars], function(x) as.numeric(as.character(x)))
# Convert categorical character variables to factors.
# Fix: "RelationshipSatisfaction Desc" is included in this vector rather than
# being converted by a separate ad-hoc statement, so every descriptor column
# follows the same existence-checked code path.
categorical_vars <- c("Department", "EducationField", "JobRole", "Attrition", "BusinessTravel",
"Education Desc", "EnvironmentSatisfaction Desc", "Gender",
"JobInvolvement Desc", "JobSatisfaction Desc", "MaritalStatus",
"OverTime", "RelationshipSatisfaction Desc", "WorkLifeBalance Desc",
"Employee Source")
# Filter existing categorical variables
existing_categorical_vars <- categorical_vars[categorical_vars %in% names(ab_dataset_cap)]
# Convert to factor
ab_dataset_cap[existing_categorical_vars] <- lapply(ab_dataset_cap[existing_categorical_vars], factor)
# Print structure to verify changes
str(ab_dataset_cap)
## spc_tbl_ [23,156 × 43] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Department : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
## $ EducationField : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
## $ JobRole : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
## $ Age : num [1:23156] 37 41 37 37 37 41 41 41 41 41 ...
## $ Attrition : Factor w/ 2 levels "Current employee",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ DailyRate : num [1:23156] 807 1102 807 807 807 ...
## $ DistanceFromHome : num [1:23156] 6 1 6 6 6 1 1 1 1 1 ...
## $ Education : num [1:23156] 4 2 4 4 4 2 2 2 2 2 ...
## $ Education Desc : Factor w/ 5 levels "Bachelor","Below College",..: 5 3 5 5 5 3 3 3 3 3 ...
## $ EmployeeNumber : num [1:23156] 1 1 4 5 6 7 8 9 10 11 ...
## $ Unique Emp ID : chr [1:23156] "Human Resources_1" "Sales_1" "Human Resources_4" "Human Resources_5" ...
## $ Application ID : chr [1:23156] "123457" "123456" "123459" "123460" ...
## $ EnvironmentSatisfaction : num [1:23156] 1 2 1 1 1 2 2 2 4 1 ...
## $ EnvironmentSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 2 3 2 2 2 3 3 3 4 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ HourlyRate : num [1:23156] 37 94 37 37 37 94 94 94 33 41 ...
## $ JobInvolvement : num [1:23156] 3 3 3 3 3 3 3 3 3 3 ...
## $ JobInvolvement Desc : Factor w/ 4 levels "High","Low","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ JobLevel : num [1:23156] 2 2 2 2 2 2 2 2 4 5 ...
## $ JobSatisfaction : num [1:23156] 4 4 4 4 4 4 4 4 3 1 ...
## $ JobSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 4 4 4 4 4 4 4 4 1 2 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
## $ MonthlyIncome : num [1:23156] 5993 5993 5993 5993 5993 ...
## $ MonthlyRate : num [1:23156] 19479 19479 19479 19479 19479 ...
## $ NumCompaniesWorked : num [1:23156] 8 8 5 8 5 8 4 8 2 5 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
## $ PercentSalaryHike : num [1:23156] 11 11 11 11 11 11 11 11 14 11 ...
## $ PerformanceRating : num [1:23156] 4 3 3 3 3 3 3 3 3 3 ...
## $ RelationshipSatisfaction : num [1:23156] 1 1 1 1 1 1 1 1 3 4 ...
## $ RelationshipSatisfaction Desc: Factor w/ 4 levels "High","Low","Medium",..: 2 2 2 2 2 2 2 2 1 4 ...
## $ StandardHours : num [1:23156] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:23156] 0 0 0 0 0 0 0 0 3 0 ...
## $ TotalWorkingYears : num [1:23156] 8 8 8 8 8 8 8 8 21 33 ...
## $ TrainingTimesLastYear : num [1:23156] 0 0 0 0 0 0 0 0 2 5 ...
## $ WorkLifeBalance : num [1:23156] 1 1 1 1 1 1 1 1 3 1 ...
## $ WorkLifeBalance Desc : Factor w/ 4 levels "Bad","Best","Better",..: 1 1 1 1 1 1 1 1 3 1 ...
## $ YearsAtCompany : num [1:23156] 6 6 6 6 6 6 6 6 5 29 ...
## $ YearsInCurrentRole : num [1:23156] 4 4 4 4 4 4 4 4 0 8 ...
## $ YearsSinceLastPromotion : num [1:23156] 0 0 0 0 0 0 0 0 0 11 ...
## $ YearsWithCurrManager : num [1:23156] 5 5 5 5 5 5 5 5 2 10 ...
## $ Employee Source : Factor w/ 10 levels "Adzuna","Company Website",..: 8 8 8 8 8 8 8 8 2 4 ...
## $ Index : num [1:23156] 0 1 2 3 4 5 6 7 8 9 ...
## - attr(*, "spec")=
## .. cols(
## .. Department = col_character(),
## .. EducationField = col_character(),
## .. JobRole = col_character(),
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. `Education Desc` = col_character(),
## .. EmployeeNumber = col_double(),
## .. `Unique Emp ID` = col_character(),
## .. `Application ID` = col_character(),
## .. EnvironmentSatisfaction = col_double(),
## .. `EnvironmentSatisfaction Desc` = col_character(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. `JobInvolvement Desc` = col_character(),
## .. JobLevel = col_double(),
## .. JobSatisfaction = col_double(),
## .. `JobSatisfaction Desc` = col_character(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. `RelationshipSatisfaction Desc` = col_character(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. `WorkLifeBalance Desc` = col_character(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double(),
## .. `Employee Source` = col_character(),
## .. Index = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Count missing (NA) values per column.
# vapply() instead of sapply(): the return type is guaranteed to be a named
# integer vector regardless of the input, so the result shape cannot silently
# change.
missing_values <- vapply(ab_dataset_cap, function(x) sum(is.na(x)), integer(1))
missing_values
## Department EducationField
## 0 0
## JobRole Age
## 0 0
## Attrition BusinessTravel
## 0 0
## DailyRate DistanceFromHome
## 0 0
## Education Education Desc
## 0 0
## EmployeeNumber Unique Emp ID
## 0 0
## Application ID EnvironmentSatisfaction
## 0 0
## EnvironmentSatisfaction Desc Gender
## 0 0
## HourlyRate JobInvolvement
## 0 0
## JobInvolvement Desc JobLevel
## 0 0
## JobSatisfaction JobSatisfaction Desc
## 0 0
## MaritalStatus MonthlyIncome
## 0 0
## MonthlyRate NumCompaniesWorked
## 0 0
## OverTime PercentSalaryHike
## 0 0
## PerformanceRating RelationshipSatisfaction
## 0 0
## RelationshipSatisfaction Desc StandardHours
## 0 0
## StockOptionLevel TotalWorkingYears
## 0 0
## TrainingTimesLastYear WorkLifeBalance
## 0 0
## WorkLifeBalance Desc YearsAtCompany
## 0 0
## YearsInCurrentRole YearsSinceLastPromotion
## 0 0
## YearsWithCurrManager Employee Source
## 0 0
## Index
## 0
# Summary of Missing Values
# summarise_all() is superseded; use summarise(across(everything(), ...))
# (dplyr >= 1.0) for the same column-wise aggregation.
# Summary of missing (NA) values per column
na_summary <- ab_dataset_cap %>%
summarise(across(everything(), ~ sum(is.na(.x)))) %>%
mutate(Source = "NA")
# Zeros as potential placeholders for missing data.
# Note: on factor columns `== 0` matches a literal "0" level (e.g. the "0"
# level of EducationField), which is the intended placeholder check here.
zero_summary <- ab_dataset_cap %>%
summarise(across(everything(), ~ sum(.x == 0, na.rm = TRUE))) %>%
mutate(Source = "Zero")
# Combine summaries: row 1 = NA counts, row 2 = zero counts
summary_combined <- bind_rows(na_summary, zero_summary)
print(summary_combined)
## # A tibble: 2 × 44
## Department EducationField JobRole Age Attrition BusinessTravel DailyRate
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## 2 0 125 0 0 0 0 0
## # ℹ 37 more variables: DistanceFromHome <int>, Education <int>,
## # `Education Desc` <int>, EmployeeNumber <int>, `Unique Emp ID` <int>,
## # `Application ID` <int>, EnvironmentSatisfaction <int>,
## # `EnvironmentSatisfaction Desc` <int>, Gender <int>, HourlyRate <int>,
## # JobInvolvement <int>, `JobInvolvement Desc` <int>, JobLevel <int>,
## # JobSatisfaction <int>, `JobSatisfaction Desc` <int>, MaritalStatus <int>,
## # MonthlyIncome <int>, MonthlyRate <int>, NumCompaniesWorked <int>, …
# Calculate total count for normalization of percentages
total_count_numeric <- sum(table(ab_dataset_cap$Attrition))
# Visualize the distribution of Attrition with counts and rounded percentages.
# `..count..` was deprecated in ggplot2 3.4.0 (the chunk previously emitted a
# lifecycle warning); use after_stat(count) instead.
ggplot(ab_dataset_cap, aes(x = factor(Attrition), fill = factor(Attrition))) +
  geom_bar() +
  geom_text(
    aes(label = paste0(after_stat(count), " (",
                       round(after_stat(count) / total_count_numeric * 100, 2), "%)")),
    stat = "count",
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(title = "Distribution of Attrition", x = "Attrition Status", y = "Count") +
  scale_y_continuous(labels = scales::comma)
# List all column names to verify the dataset structure after conversion
colnames(ab_dataset_cap)
## [1] "Department" "EducationField"
## [3] "JobRole" "Age"
## [5] "Attrition" "BusinessTravel"
## [7] "DailyRate" "DistanceFromHome"
## [9] "Education" "Education Desc"
## [11] "EmployeeNumber" "Unique Emp ID"
## [13] "Application ID" "EnvironmentSatisfaction"
## [15] "EnvironmentSatisfaction Desc" "Gender"
## [17] "HourlyRate" "JobInvolvement"
## [19] "JobInvolvement Desc" "JobLevel"
## [21] "JobSatisfaction" "JobSatisfaction Desc"
## [23] "MaritalStatus" "MonthlyIncome"
## [25] "MonthlyRate" "NumCompaniesWorked"
## [27] "OverTime" "PercentSalaryHike"
## [29] "PerformanceRating" "RelationshipSatisfaction"
## [31] "RelationshipSatisfaction Desc" "StandardHours"
## [33] "StockOptionLevel" "TotalWorkingYears"
## [35] "TrainingTimesLastYear" "WorkLifeBalance"
## [37] "WorkLifeBalance Desc" "YearsAtCompany"
## [39] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [41] "YearsWithCurrManager" "Employee Source"
## [43] "Index"
## # A tibble: 1 × 9
## rows columns discrete_columns continuous_columns all_missing_columns
## <int> <int> <int> <int> <int>
## 1 23156 43 17 26 0
## # ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
## # total_observations <int>, memory_usage <dbl>
## spc_tbl_ [23,156 × 43] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Department : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
## $ EducationField : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
## $ JobRole : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
## $ Age : num [1:23156] 37 41 37 37 37 41 41 41 41 41 ...
## $ Attrition : Factor w/ 2 levels "Current employee",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ DailyRate : num [1:23156] 807 1102 807 807 807 ...
## $ DistanceFromHome : num [1:23156] 6 1 6 6 6 1 1 1 1 1 ...
## $ Education : num [1:23156] 4 2 4 4 4 2 2 2 2 2 ...
## $ Education Desc : Factor w/ 5 levels "Bachelor","Below College",..: 5 3 5 5 5 3 3 3 3 3 ...
## $ EmployeeNumber : num [1:23156] 1 1 4 5 6 7 8 9 10 11 ...
## $ Unique Emp ID : chr [1:23156] "Human Resources_1" "Sales_1" "Human Resources_4" "Human Resources_5" ...
## $ Application ID : chr [1:23156] "123457" "123456" "123459" "123460" ...
## $ EnvironmentSatisfaction : num [1:23156] 1 2 1 1 1 2 2 2 4 1 ...
## $ EnvironmentSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 2 3 2 2 2 3 3 3 4 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ HourlyRate : num [1:23156] 37 94 37 37 37 94 94 94 33 41 ...
## $ JobInvolvement : num [1:23156] 3 3 3 3 3 3 3 3 3 3 ...
## $ JobInvolvement Desc : Factor w/ 4 levels "High","Low","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ JobLevel : num [1:23156] 2 2 2 2 2 2 2 2 4 5 ...
## $ JobSatisfaction : num [1:23156] 4 4 4 4 4 4 4 4 3 1 ...
## $ JobSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 4 4 4 4 4 4 4 4 1 2 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
## $ MonthlyIncome : num [1:23156] 5993 5993 5993 5993 5993 ...
## $ MonthlyRate : num [1:23156] 19479 19479 19479 19479 19479 ...
## $ NumCompaniesWorked : num [1:23156] 8 8 5 8 5 8 4 8 2 5 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
## $ PercentSalaryHike : num [1:23156] 11 11 11 11 11 11 11 11 14 11 ...
## $ PerformanceRating : num [1:23156] 4 3 3 3 3 3 3 3 3 3 ...
## $ RelationshipSatisfaction : num [1:23156] 1 1 1 1 1 1 1 1 3 4 ...
## $ RelationshipSatisfaction Desc: Factor w/ 4 levels "High","Low","Medium",..: 2 2 2 2 2 2 2 2 1 4 ...
## $ StandardHours : num [1:23156] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:23156] 0 0 0 0 0 0 0 0 3 0 ...
## $ TotalWorkingYears : num [1:23156] 8 8 8 8 8 8 8 8 21 33 ...
## $ TrainingTimesLastYear : num [1:23156] 0 0 0 0 0 0 0 0 2 5 ...
## $ WorkLifeBalance : num [1:23156] 1 1 1 1 1 1 1 1 3 1 ...
## $ WorkLifeBalance Desc : Factor w/ 4 levels "Bad","Best","Better",..: 1 1 1 1 1 1 1 1 3 1 ...
## $ YearsAtCompany : num [1:23156] 6 6 6 6 6 6 6 6 5 29 ...
## $ YearsInCurrentRole : num [1:23156] 4 4 4 4 4 4 4 4 0 8 ...
## $ YearsSinceLastPromotion : num [1:23156] 0 0 0 0 0 0 0 0 0 11 ...
## $ YearsWithCurrManager : num [1:23156] 5 5 5 5 5 5 5 5 2 10 ...
## $ Employee Source : Factor w/ 10 levels "Adzuna","Company Website",..: 8 8 8 8 8 8 8 8 2 4 ...
## $ Index : num [1:23156] 0 1 2 3 4 5 6 7 8 9 ...
## - attr(*, "spec")=
## .. cols(
## .. Department = col_character(),
## .. EducationField = col_character(),
## .. JobRole = col_character(),
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. `Education Desc` = col_character(),
## .. EmployeeNumber = col_double(),
## .. `Unique Emp ID` = col_character(),
## .. `Application ID` = col_character(),
## .. EnvironmentSatisfaction = col_double(),
## .. `EnvironmentSatisfaction Desc` = col_character(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. `JobInvolvement Desc` = col_character(),
## .. JobLevel = col_double(),
## .. JobSatisfaction = col_double(),
## .. `JobSatisfaction Desc` = col_character(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. `RelationshipSatisfaction Desc` = col_character(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. `WorkLifeBalance Desc` = col_character(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double(),
## .. `Employee Source` = col_character(),
## .. Index = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert Attrition to a binary numeric variable (1 = Voluntary Resignation)
ab_dataset_cap$Attrition_numeric <- as.numeric(ab_dataset_cap$Attrition == "Voluntary Resignation")
# Calculate correlations with Attrition over the numeric columns.
# Constant columns (StandardHours is all 80s per the str() output above) have
# zero standard deviation; cor() warned about them and produced NA rows that
# sort() silently dropped. Exclude them explicitly instead.
num_cols <- ab_dataset_cap[, vapply(ab_dataset_cap, is.numeric, logical(1))]
num_cols <- num_cols[, vapply(num_cols, function(x) sd(x, na.rm = TRUE) > 0, logical(1)), drop = FALSE]
correlations <- cor(num_cols, use = "pairwise.complete.obs")
cor_attrition <- correlations[, "Attrition_numeric"]
sorted_cor <- sort(cor_attrition, decreasing = TRUE)
# View the correlations sorted
print(sorted_cor)
## Attrition_numeric DistanceFromHome NumCompaniesWorked
## 1.000000000 0.069913505 0.040475987
## HourlyRate Index EmployeeNumber
## 0.012403772 -0.002001987 -0.002019309
## RelationshipSatisfaction PerformanceRating MonthlyRate
## -0.005611355 -0.006735761 -0.008343636
## YearsSinceLastPromotion WorkLifeBalance PercentSalaryHike
## -0.018987128 -0.024119391 -0.024932085
## Education TrainingTimesLastYear EnvironmentSatisfaction
## -0.035920166 -0.045957616 -0.047779190
## JobSatisfaction DailyRate YearsAtCompany
## -0.053336866 -0.058597018 -0.061023246
## YearsWithCurrManager JobInvolvement MonthlyIncome
## -0.066725958 -0.067270649 -0.076901671
## TotalWorkingYears JobLevel YearsInCurrentRole
## -0.082217359 -0.082659988 -0.085523590
## StockOptionLevel Age
## -0.093021550 -0.152760524
DistanceFromHome (0.069913505): Positive correlation (0.07) with Attrition_numeric suggests that there is a slight tendency for employees who live farther from the workplace to have a slightly higher likelihood of attrition.
NumCompaniesWorked (0.040475987): Positive correlation (0.04) with Attrition_numeric suggests that employees who have worked at more companies before joining the current one may have a slightly higher tendency to leave.
HourlyRate (0.012403772): Very weak positive correlation (0.01) with Attrition_numeric, indicating that there’s almost no relationship between hourly rate and attrition.
Index (-0.002001987): This variable seems to have almost no correlation with Attrition_numeric or other variables, indicated by a very close to zero coefficient.
EmployeeNumber (-0.002019309): Almost no correlation with Attrition_numeric or other variables.
RelationshipSatisfaction (-0.005611355): Very weak negative correlation (-0.01) with Attrition_numeric, suggesting that employees with higher relationship satisfaction might have a slightly lower likelihood of attrition.
PerformanceRating (-0.006735761): Very weak negative correlation (-0.01) with Attrition_numeric, implying that employees with higher performance ratings might have a slightly lower likelihood of attrition.
MonthlyRate (-0.008343636): Very weak negative correlation (-0.01) with Attrition_numeric, indicating that there’s almost no relationship between monthly rate and attrition.
YearsSinceLastPromotion (-0.018987128): Weak negative correlation (-0.02) with Attrition_numeric, suggesting that employees with more years since their last promotion may have a slightly lower likelihood of attrition.
# Columns excluded from the modelling subset: redundant textual descriptors,
# identifier columns, the constant StandardHours, and the original Attrition
# label (replaced by Attrition_numeric)
drop_cols <- c(
  "Education Desc",
  "EnvironmentSatisfaction Desc",
  "JobInvolvement Desc",
  "JobSatisfaction Desc",
  "RelationshipSatisfaction Desc",
  "WorkLifeBalance Desc",
  "EmployeeNumber",
  "Unique Emp ID",
  "Application ID",
  "Index",
  "StandardHours",
  "Attrition",
  "Employee Source"
)
ab_dataset_reduced <- ab_dataset_cap[, setdiff(names(ab_dataset_cap), drop_cols)]
# Correlation matrix over the numeric columns only
numeric_vars <- sapply(ab_dataset_reduced, is.numeric)
cor_matrix <- cor(ab_dataset_reduced[, numeric_vars], use = "pairwise.complete.obs")
# Long (Var1, Var2, value) format required for ggplot tiling
melted_cor_matrix <- melt(cor_matrix)
# Heatmap of the pairwise correlations
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(name = "Correlation", low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.text.y = element_text(angle = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  labs(title = "Correlation Matrix Heatmap with initial selected Variables")
# Confirm which columns remain in the reduced dataset
colnames(ab_dataset_reduced)
## [1] "Department" "EducationField"
## [3] "JobRole" "Age"
## [5] "BusinessTravel" "DailyRate"
## [7] "DistanceFromHome" "Education"
## [9] "EnvironmentSatisfaction" "Gender"
## [11] "HourlyRate" "JobInvolvement"
## [13] "JobLevel" "JobSatisfaction"
## [15] "MaritalStatus" "MonthlyIncome"
## [17] "MonthlyRate" "NumCompaniesWorked"
## [19] "OverTime" "PercentSalaryHike"
## [21] "PerformanceRating" "RelationshipSatisfaction"
## [23] "StockOptionLevel" "TotalWorkingYears"
## [25] "TrainingTimesLastYear" "WorkLifeBalance"
## [27] "YearsAtCompany" "YearsInCurrentRole"
## [29] "YearsSinceLastPromotion" "YearsWithCurrManager"
## [31] "Attrition_numeric"
# Bar plot of the Attrition_numeric distribution.
# Fixes two ggplot2 3.4.0 deprecations the chunk previously warned about:
# `..count..` -> after_stat(count), and `size` -> `linewidth` in element_line().
ggplot(ab_dataset_reduced, aes(x=factor(Attrition_numeric), fill=factor(Attrition_numeric))) +
  geom_bar(stat="count", position="dodge", color="black") +
  geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.5, color="black") +
  scale_fill_manual(values=c("#56B4E9", "#E69F00"), labels=c("No Attrition (0)", "Attrition (1)")) +
  scale_y_continuous(labels=scales::comma) +
  labs(title="Attrition Numeric Distribution", x="Attrition Status", y="Frequency", fill="Attrition Status") +
  theme_minimal() +
  theme(text = element_text(size=12, face="bold"),
        plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major = element_line(linewidth = 0.1, linetype = 'solid', color = "gray"),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = "white"),
        legend.position = "bottom")
# Convert 'Attrition_numeric' to a factor for classification modelling
ab_dataset_reduced$Attrition_numeric <- as.factor(ab_dataset_reduced$Attrition_numeric)
# Check for multicollinearity among the remaining numeric predictors
cor_matrix <- cor(ab_dataset_reduced[, sapply(ab_dataset_reduced, is.numeric)], use = "pairwise.complete.obs")
# Identify high correlations.
# Fix: filtering with `cor_matrix != 1` failed to remove self-correlations
# whose diagonal was 0.999... from floating point (the earlier output listed
# MonthlyRate~MonthlyRate and TrainingTimesLastYear~TrainingTimesLastYear at
# 1.0), and it reported every pair twice ((i,j) and (j,i)). Restricting to
# the upper triangle removes both problems.
high_cor <- which(abs(cor_matrix) > 0.7 & upper.tri(cor_matrix), arr.ind = TRUE)
high_cor_vars <- data.frame(
  Var1 = rownames(cor_matrix)[high_cor[, 1]],
  Var2 = colnames(cor_matrix)[high_cor[, 2]],
  Correlation = cor_matrix[high_cor]
)
print(high_cor_vars)
## Var1 Var2 Correlation
## 1 MonthlyIncome JobLevel 0.9431299
## 2 TotalWorkingYears JobLevel 0.7665359
## 3 JobLevel MonthlyIncome 0.9431299
## 4 TotalWorkingYears MonthlyIncome 0.7640761
## 5 MonthlyRate MonthlyRate 1.0000000
## 6 JobLevel TotalWorkingYears 0.7665359
## 7 MonthlyIncome TotalWorkingYears 0.7640761
## 8 TrainingTimesLastYear TrainingTimesLastYear 1.0000000
## 9 YearsInCurrentRole YearsAtCompany 0.7583112
## 10 YearsWithCurrManager YearsAtCompany 0.7705981
## 11 YearsAtCompany YearsInCurrentRole 0.7583112
## 12 YearsWithCurrManager YearsInCurrentRole 0.7158998
## 13 YearsAtCompany YearsWithCurrManager 0.7705981
## 14 YearsInCurrentRole YearsWithCurrManager 0.7158998
# Positive correlation: Monthly Income against Total Working Years
p1 <- ggplot(ab_dataset_reduced, aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point(colour = "#6D98BA", alpha = 0.5) +
  geom_smooth(method = "loess", color = "#D95F02") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  labs(title = "Positive Correlation", subtitle = "Monthly Income vs Working Years")
# Arrange the plots into a grid
plot_grid(p1)
## `geom_smooth()` using formula = 'y ~ x'
# YearsInCurrentRole vs Attrition_numeric
mean_years_in_current_role <- mean(ab_dataset_reduced$YearsInCurrentRole)
p2 <- ab_dataset_reduced %>% select(YearsInCurrentRole, Attrition_numeric) %>%
  ggplot(aes(x = YearsInCurrentRole, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # Fix: the `aes(label=)` mapping inside geom_vline() was ignored (it has no
  # label aesthetic; ggplot2 warned about it), and `size` for lines is
  # deprecated since ggplot2 3.4.0 -> use `linewidth`. The intended mean label
  # is drawn with annotate() instead.
  geom_vline(xintercept = mean_years_in_current_role, linetype = "dashed",
             linewidth = 1, color = "red") +
  annotate("text", x = mean_years_in_current_role, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_years_in_current_role, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Years in Current Role", x = "Years in Current Role", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p2)
# TrainingTimesLastYear vs Attrition_numeric
mean_training_times_last_year <- mean(ab_dataset_reduced$TrainingTimesLastYear)
p3 <- ab_dataset_reduced %>% select(TrainingTimesLastYear, Attrition_numeric) %>%
  ggplot(aes(x = TrainingTimesLastYear, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # Fix: removed the ignored `aes(label=)` mapping from geom_vline() and the
  # deprecated `size` argument (-> `linewidth`); the mean label is now drawn
  # with annotate() as originally intended.
  geom_vline(xintercept = mean_training_times_last_year, linetype = "dashed",
             linewidth = 1, color = "red") +
  annotate("text", x = mean_training_times_last_year, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_training_times_last_year, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Training Times Last Year", x = "Training Times Last Year", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p3)
# DistanceFromHome vs Attrition_numeric
mean_distance_from_home <- mean(ab_dataset_reduced$DistanceFromHome)
p4 <- ab_dataset_reduced %>% select(DistanceFromHome, Attrition_numeric) %>%
  ggplot(aes(x = DistanceFromHome, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # Fix: removed the ignored `aes(label=)` mapping from geom_vline() and the
  # deprecated `size` argument (-> `linewidth`); the mean label is now drawn
  # with annotate() as originally intended.
  geom_vline(xintercept = mean_distance_from_home, linetype = "dashed",
             linewidth = 1, color = "red") +
  annotate("text", x = mean_distance_from_home, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_distance_from_home, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Distance from Home", x = "Distance from Home", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p4)
# Compute mean age and the tallest single-age count per attrition status; the
# latter is used to position the mean labels near the top of the histogram.
# Fix: this summary was previously computed twice verbatim — once is enough.
mean_ages <- ab_dataset_reduced %>%
  group_by(Attrition_numeric) %>%
  summarise(
    MeanAge = mean(Age, na.rm = TRUE),
    MaxCount = max(table(Age)),
    .groups = "drop"
  )
# Create the histogram of Age by attrition status
age_distribution_plot <- ab_dataset_reduced %>%
  ggplot(aes(x = Age, fill = factor(Attrition_numeric))) +
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  scale_fill_manual(values = c("#5DA5B3", "#26547C"),
                    labels = c("Stayed", "Left"),
                    name = "Attrition Status") +
  geom_vline(data = mean_ages,
             aes(xintercept = MeanAge, color = factor(Attrition_numeric)),
             linetype = "dashed",
             linewidth = 1.5) + # `size` for lines deprecated since ggplot2 3.4.0
  # The color aesthetic previously mapped here was overridden by the literal
  # color = "black" anyway, so the redundant mapping is removed.
  geom_text(data = mean_ages,
            aes(x = MeanAge, label = paste("Mean:", round(MeanAge, 1)), y = MaxCount),
            size = 4, vjust = 1.5, fontface = "bold", color = "black") +
  scale_color_manual(values = c("#5DA5B3", "#26547C")) +
  guides(color = "none") + # guides(<scale> = FALSE) deprecated since ggplot2 3.3.4
  theme_minimal() +
  labs(title = "Age Distribution by Attrition Status",
       x = "Age",
       y = "Count") +
  theme(legend.position = "top",
        plot.title = element_text(size = 14, face = "bold"),
        axis.title = element_text(size = 12),
        legend.background = element_blank(),
        legend.key = element_blank())
# Print the plot
print(age_distribution_plot)
Monthly Income vs Age
# Positive correlation: Monthly Income against Age
p1 <- ggplot(ab_dataset_reduced, aes(x = Age, y = MonthlyIncome)) +
  geom_point(colour = "#6D98BA", alpha = 0.5) +
  geom_smooth(method = "loess", color = "#D95F02") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  labs(title = "Positive Correlation", subtitle = "Monthly Income vs Age")
# Arrange the plots into a grid
plot_grid(p1)
## `geom_smooth()` using formula = 'y ~ x'
# Define the color palette
# NOTE(review): color_palette is not used in this chunk — confirm whether a
# later chunk needs it or it can be removed.
color_palette <- brewer.pal(9, "Blues")
# Mean age and tallest single-age count per department/attrition combination,
# used to position the mean-age labels above the bars
mean_ages_dept <- ab_dataset_reduced %>%
  group_by(Department, Attrition_numeric) %>%
  summarise(
    MeanAge = mean(Age, na.rm = TRUE),
    MaxCount = max(table(Age)),
    .groups = 'drop' # Drop the grouping so it is not carried into the plot
  )
age_distribution_plot_dept <- ab_dataset_reduced %>%
  ggplot(aes(x = Department, fill = factor(Attrition_numeric))) +
  geom_bar(position = 'dodge', alpha = 0.6) +
  scale_fill_manual(values = c("#6D88BA", "#D93F02"),
                    labels = c("Stayed", "Left"),
                    name = "Attrition Status") +
  geom_text(data = mean_ages_dept,
            aes(x = Department, y = MaxCount,
                label = paste("Mean:", round(MeanAge, 1)),
                color = factor(Attrition_numeric)),
            size = 4, vjust = -0.5, fontface = "bold",
            position = position_dodge(width = 0.9)) +
  scale_color_manual(values = c("black", "black")) + # Render both mean labels in black
  theme_minimal() +
  labs(title = "Attrition Distribution by Department",
       x = "Department",
       y = "Count") +
  theme(legend.position = "top",
        plot.title = element_text(size = 14, face = "bold"),
        axis.title = element_text(size = 12),
        legend.text = element_text(color = "black"), # Ensure legend text is black for visibility
        legend.background = element_blank(),
        legend.key = element_blank(),
        legend.title = element_text(color = "black")) + # Ensure legend title is black for visibility
  guides(color = "none") # guides(<scale> = FALSE) deprecated since ggplot2 3.3.4
# Print the plot
print(age_distribution_plot_dept)
# Standardize (z-score) every numeric column of "ab_dataset_reduced".
# NOTE(review): scaling happens BEFORE the train/test split further down, so
# test rows influence the scaling parameters — possible leakage; confirm intent.
numeric_vars <- vapply(ab_dataset_reduced, is.numeric, logical(1))
ab_dataset_reduced[numeric_vars] <- scale(ab_dataset_reduced[numeric_vars])
# Guarantee a plain data.frame (scale() returns matrices for the columns)
ab_dataset_reduced <- as.data.frame(ab_dataset_reduced)
# Inspect the structure to confirm the transformation took effect
str(ab_dataset_reduced)
## 'data.frame': 23156 obs. of 31 variables:
## $ Department : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
## $ EducationField : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
## $ JobRole : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
## $ Age : num 0.00925 0.44817 0.00925 0.00925 0.00925 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ DailyRate : num 0.012 0.744 0.012 0.012 0.012 ...
## $ DistanceFromHome : num -0.394 -1.011 -0.394 -0.394 -0.394 ...
## $ Education : num 0.974 -1.086 0.974 0.974 0.974 ...
## $ EnvironmentSatisfaction : num -1.571 -0.658 -1.571 -1.571 -1.571 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ HourlyRate : num -1.42 1.38 -1.42 -1.42 -1.42 ...
## $ JobInvolvement : num 0.379 0.379 0.379 0.379 0.379 ...
## $ JobLevel : num -0.0588 -0.0588 -0.0588 -0.0588 -0.0588 ...
## $ JobSatisfaction : num 1.16 1.16 1.16 1.16 1.16 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
## $ MonthlyIncome : num -0.109 -0.109 -0.109 -0.109 -0.109 ...
## $ MonthlyRate : num 0.73 0.73 0.73 0.73 0.73 ...
## $ NumCompaniesWorked : num 2.126 2.126 0.925 2.126 0.925 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
## $ PercentSalaryHike : num -1.15 -1.15 -1.15 -1.15 -1.15 ...
## $ PerformanceRating : num 2.333 -0.429 -0.429 -0.429 -0.429 ...
## $ RelationshipSatisfaction: num -1.59 -1.59 -1.59 -1.59 -1.59 ...
## $ StockOptionLevel : num -0.934 -0.934 -0.934 -0.934 -0.934 ...
## $ TotalWorkingYears : num -0.421 -0.421 -0.421 -0.421 -0.421 ...
## $ TrainingTimesLastYear : num -2.17 -2.17 -2.17 -2.17 -2.17 ...
## $ WorkLifeBalance : num -2.5 -2.5 -2.5 -2.5 -2.5 ...
## $ YearsAtCompany : num -0.166 -0.166 -0.166 -0.166 -0.166 ...
## $ YearsInCurrentRole : num -0.0637 -0.0637 -0.0637 -0.0637 -0.0637 ...
## $ YearsSinceLastPromotion : num -0.679 -0.679 -0.679 -0.679 -0.679 ...
## $ YearsWithCurrManager : num 0.243 0.243 0.243 0.243 0.243 ...
## $ Attrition_numeric : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
summary(ab_dataset_reduced[numeric_vars])
## Age DailyRate DistanceFromHome Education
## Min. :-2.0756 Min. :-1.7366401 Min. :-1.0113 Min. :-2.11567
## 1st Qu.:-0.7589 1st Qu.:-0.8362733 1st Qu.:-0.8878 1st Qu.:-1.08579
## Median :-0.1005 Median :-0.0003957 Median :-0.2704 Median :-0.05591
## Mean : 0.0000 Mean : 0.0000000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6676 3rd Qu.: 0.8801283 3rd Qu.: 0.5940 3rd Qu.: 0.97398
## Max. : 2.5330 Max. : 1.7284077 Max. : 2.4462 Max. : 2.00386
## EnvironmentSatisfaction HourlyRate JobInvolvement
## Min. :-1.5711 Min. :-1.763489 Min. :-2.4383
## 1st Qu.:-0.6576 1st Qu.:-0.878104 1st Qu.:-1.0297
## Median : 0.2559 Median : 0.007282 Median : 0.3789
## Mean : 0.0000 Mean : 0.000000 Mean : 0.0000
## 3rd Qu.: 1.1693 3rd Qu.: 0.843479 3rd Qu.: 0.3789
## Max. : 1.1693 Max. : 1.679677 Max. : 1.7876
## JobLevel JobSatisfaction MonthlyIncome MonthlyRate
## Min. :-0.96299 Min. :-1.5685 Min. :-1.1681 Min. :-1.71862
## 1st Qu.:-0.96299 1st Qu.:-0.6605 1st Qu.:-0.7640 1st Qu.:-0.88050
## Median :-0.05884 Median : 0.2476 Median :-0.3337 Median :-0.01111
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.84530 3rd Qu.: 1.1556 3rd Qu.: 0.3980 3rd Qu.: 0.86589
## Max. : 2.65359 Max. : 1.1556 Max. : 2.8666 Max. : 1.78894
## NumCompaniesWorked PercentSalaryHike PerformanceRating
## Min. :-1.0760 Min. :-1.1518 Min. :-0.4286
## 1st Qu.:-0.6758 1st Qu.:-0.8785 1st Qu.:-0.4286
## Median :-0.2756 Median :-0.3320 Median :-0.4286
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5249 3rd Qu.: 0.7612 3rd Qu.:-0.4286
## Max. : 2.5260 Max. : 2.6741 Max. : 2.3330
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## Min. :-1.5864 Min. :-0.9336 Min. :-1.4512
## 1st Qu.:-0.6609 1st Qu.:-0.9336 1st Qu.:-0.6783
## Median : 0.2647 Median : 0.2415 Median :-0.1631
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.1902 3rd Qu.: 0.2415 3rd Qu.: 0.4810
## Max. : 1.1902 Max. : 2.5918 Max. : 3.7013
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :-2.1699 Min. :-2.4981 Min. :-1.1433 Min. :-1.1665
## 1st Qu.:-0.6195 1st Qu.:-1.0804 1st Qu.:-0.6544 1st Qu.:-0.6151
## Median : 0.1557 Median : 0.3374 Median :-0.3285 Median :-0.3394
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.1557 3rd Qu.: 0.3374 3rd Qu.: 0.4863 3rd Qu.: 0.7634
## Max. : 2.4812 Max. : 1.7552 Max. : 5.3753 Max. : 3.7961
## YearsSinceLastPromotion YearsWithCurrManager
## Min. :-0.6787 Min. :-1.1565
## 1st Qu.:-0.6787 1st Qu.:-0.5966
## Median :-0.3676 Median :-0.3167
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2546 3rd Qu.: 0.8032
## Max. : 3.9875 Max. : 3.6027
set.seed(123) # reproducible random sampling
# Stratified 70/30 split on the outcome variable
train_idx <- createDataPartition(y = ab_dataset_reduced$Attrition_numeric, p = 0.7, list = FALSE)
training_set <- ab_dataset_reduced[train_idx, ]
testing_set <- ab_dataset_reduced[-train_idx, ]
# Sanity-check the partition sizes
nrow(training_set) # roughly 70% of the rows
## [1] 16210
nrow(testing_set) # roughly 30% of the rows
## [1] 6946
# Make the outcome an explicit two-level factor in both partitions
training_set$Attrition_numeric <- factor(training_set$Attrition_numeric, levels = c(0, 1))
testing_set$Attrition_numeric <- factor(testing_set$Attrition_numeric, levels = c(0, 1))
# Check initial class distribution (heavily imbalanced toward class 0)
table(training_set$Attrition_numeric)
##
## 0 1
## 13650 2560
# Balance the classes by down-sampling the majority class.
# Seed the RNG first so the random down-sample is reproducible — the
# up-sampling step later in this script seeds its RNG the same way.
set.seed(123)
training_set <- downSample(x = training_set[, -which(names(training_set) == "Attrition_numeric")],
                           y = training_set$Attrition_numeric)
# Check the new class distribution to confirm balancing
table(training_set$Class)
##
## 0 1
## 2560 2560
# downSample() names the outcome column 'Class'; restore the original name
training_set$Attrition_numeric <- training_set$Class
training_set$Class <- NULL
# Tabulate counts and percentage shares of each attrition class
attrition_counts <- table(training_set$Attrition_numeric)
attrition_df <- data.frame(
  Attrition_numeric = names(attrition_counts),
  Count = as.integer(attrition_counts)
)
attrition_df$Percentage <- 100 * attrition_df$Count / sum(attrition_df$Count)
# Bar plot of the class balance, labelled with count and percentage
ggplot(attrition_df, aes(x = Attrition_numeric, y = Count, fill = Attrition_numeric)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste(Count, " (", sprintf("%.1f", Percentage), "%)", sep = "")),
            vjust = -0.5, position = position_stack(vjust = 0.5)) +
  labs(title = "Distribution of Attrition Numeric", x = "Attrition Numeric Status", y = "Count") +
  theme_minimal()
names(training_set)
## [1] "Department" "EducationField"
## [3] "JobRole" "Age"
## [5] "BusinessTravel" "DailyRate"
## [7] "DistanceFromHome" "Education"
## [9] "EnvironmentSatisfaction" "Gender"
## [11] "HourlyRate" "JobInvolvement"
## [13] "JobLevel" "JobSatisfaction"
## [15] "MaritalStatus" "MonthlyIncome"
## [17] "MonthlyRate" "NumCompaniesWorked"
## [19] "OverTime" "PercentSalaryHike"
## [21] "PerformanceRating" "RelationshipSatisfaction"
## [23] "StockOptionLevel" "TotalWorkingYears"
## [25] "TrainingTimesLastYear" "WorkLifeBalance"
## [27] "YearsAtCompany" "YearsInCurrentRole"
## [29] "YearsSinceLastPromotion" "YearsWithCurrManager"
## [31] "Attrition_numeric"
# Fit a logistic regression on the balanced training set, using every
# remaining column as a predictor.
# The summary below reports "(2 not defined because of singularities)":
# two JobRole dummy coefficients come out NA because the model matrix is
# rank-deficient (some role levels are determined by other dummies).
logistic_model <- glm(Attrition_numeric ~ ., data = training_set, family = binomial())
# Print out the results of the logistic regression model
summary(logistic_model)
##
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = training_set)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.224359 0.562372 -3.955 7.64e-05
## DepartmentResearch & Development -0.067411 0.184810 -0.365 0.715292
## DepartmentSales 0.648032 0.215177 3.012 0.002599
## EducationFieldHuman Resources 0.159301 0.613398 0.260 0.795094
## EducationFieldLife Sciences 0.390465 0.519361 0.752 0.452160
## EducationFieldMarketing 0.358039 0.534105 0.670 0.502633
## EducationFieldMedical 0.381208 0.518736 0.735 0.462414
## EducationFieldOther 0.268788 0.532633 0.505 0.613812
## EducationFieldTechnical Degree 0.836985 0.529792 1.580 0.114144
## JobRoleHuman Resources Director 0.891298 0.823537 1.082 0.279128
## JobRoleHuman Resources Executive 0.051361 0.494169 0.104 0.917221
## JobRoleHuman Resources Manager -13.803497 188.646196 -0.073 0.941670
## JobRoleHuman Resources Representative 0.104206 0.239413 0.435 0.663376
## JobRoleLaboratory Technician 0.126137 0.099904 1.263 0.206738
## JobRoleManufacturing Director 0.016855 0.142587 0.118 0.905900
## JobRoleResearch & Development Manager 0.039227 0.242293 0.162 0.871387
## JobRoleResearch Director 0.128618 0.150145 0.857 0.391654
## JobRoleResearch Scientist NA NA NA NA
## JobRoleSales Executive -0.429819 0.144299 -2.979 0.002895
## JobRoleSales Manager -0.241953 0.336164 -0.720 0.471680
## JobRoleSales Representative NA NA NA NA
## Age -0.395401 0.032964 -11.995 < 2e-16
## BusinessTravelTravel_Frequently 1.435087 0.134116 10.700 < 2e-16
## BusinessTravelTravel_Rarely 0.745049 0.121789 6.118 9.50e-10
## DailyRate -0.171833 0.031455 -5.463 4.69e-08
## DistanceFromHome 0.202683 0.034596 5.859 4.67e-09
## Education -0.006071 0.032201 -0.189 0.850467
## EnvironmentSatisfaction -0.199345 0.035150 -5.671 1.42e-08
## GenderMale 0.138165 0.065023 2.125 0.033599
## HourlyRate 0.120516 0.035517 3.393 0.000691
## JobInvolvement -0.191046 0.030604 -6.242 4.31e-10
## JobLevel -0.077732 0.106475 -0.730 0.465359
## JobSatisfaction -0.131958 0.031040 -4.251 2.13e-05
## MaritalStatusMarried 0.235618 0.088224 2.671 0.007570
## MaritalStatusSingle 0.628525 0.110642 5.681 1.34e-08
## MonthlyIncome -0.152886 0.108200 -1.413 0.157656
## MonthlyRate -0.047991 0.031908 -1.504 0.132574
## NumCompaniesWorked 0.025198 0.034039 0.740 0.459146
## OverTimeYes 0.887989 0.067250 13.204 < 2e-16
## PercentSalaryHike -0.135431 0.034704 -3.902 9.52e-05
## PerformanceRating 0.077937 0.035245 2.211 0.027015
## RelationshipSatisfaction -0.059815 0.031050 -1.926 0.054050
## StockOptionLevel -0.088709 0.040250 -2.204 0.027530
## TotalWorkingYears 0.139481 0.059914 2.328 0.019911
## TrainingTimesLastYear -0.129567 0.032672 -3.966 7.32e-05
## WorkLifeBalance -0.087203 0.030951 -2.817 0.004841
## YearsAtCompany 0.113734 0.067934 1.674 0.094095
## YearsInCurrentRole -0.294169 0.053191 -5.530 3.19e-08
## YearsSinceLastPromotion 0.200179 0.040968 4.886 1.03e-06
## YearsWithCurrManager -0.097849 0.053868 -1.816 0.069300
##
## (Intercept) ***
## DepartmentResearch & Development
## DepartmentSales **
## EducationFieldHuman Resources
## EducationFieldLife Sciences
## EducationFieldMarketing
## EducationFieldMedical
## EducationFieldOther
## EducationFieldTechnical Degree
## JobRoleHuman Resources Director
## JobRoleHuman Resources Executive
## JobRoleHuman Resources Manager
## JobRoleHuman Resources Representative
## JobRoleLaboratory Technician
## JobRoleManufacturing Director
## JobRoleResearch & Development Manager
## JobRoleResearch Director
## JobRoleResearch Scientist
## JobRoleSales Executive **
## JobRoleSales Manager
## JobRoleSales Representative
## Age ***
## BusinessTravelTravel_Frequently ***
## BusinessTravelTravel_Rarely ***
## DailyRate ***
## DistanceFromHome ***
## Education
## EnvironmentSatisfaction ***
## GenderMale *
## HourlyRate ***
## JobInvolvement ***
## JobLevel
## JobSatisfaction ***
## MaritalStatusMarried **
## MaritalStatusSingle ***
## MonthlyIncome
## MonthlyRate
## NumCompaniesWorked
## OverTimeYes ***
## PercentSalaryHike ***
## PerformanceRating *
## RelationshipSatisfaction .
## StockOptionLevel *
## TotalWorkingYears *
## TrainingTimesLastYear ***
## WorkLifeBalance **
## YearsAtCompany .
## YearsInCurrentRole ***
## YearsSinceLastPromotion ***
## YearsWithCurrManager .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7097.8 on 5119 degrees of freedom
## Residual deviance: 6056.2 on 5072 degrees of freedom
## AIC: 6152.2
##
## Number of Fisher Scoring iterations: 13
# Score the held-out testing set: predicted probabilities of class 1.
# The warning below is expected — the model above was rank-deficient
# (two coefficients were NA), so predict() flags doubtful cases.
testing_set_predictions <- predict(logistic_model, newdata = testing_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Converting probabilities to binary class predictions based on a threshold (usually 0.5)
predicted_classes <- ifelse(testing_set_predictions > 0.5, 1, 0)
# Create a confusion matrix of predicted vs. actual attrition labels.
# Note: caret will treat level "0" as the positive class here.
conf_mat <- confusionMatrix(as.factor(predicted_classes), as.factor(testing_set$Attrition_numeric))
# Print the confusion matrix along with statistics
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3888 321
## 1 1961 776
##
## Accuracy : 0.6715
## 95% CI : (0.6603, 0.6825)
## No Information Rate : 0.8421
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2315
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6647
## Specificity : 0.7074
## Pos Pred Value : 0.9237
## Neg Pred Value : 0.2835
## Prevalence : 0.8421
## Detection Rate : 0.5597
## Detection Prevalence : 0.6060
## Balanced Accuracy : 0.6861
##
## 'Positive' Class : 0
##
Reference: 0 represents employees who stayed with the company; 1 represents employees who left. Prediction: the class assigned by the model. Important: the caret output above reports 'Positive' Class : 0, so all of the statistics below treat "stayed" (0) as the positive class.
True Positives (TP): 3888 — instances correctly predicted as class 0 (the model said "stayed" and the employee did stay).
False Positives (FP): 321 — instances predicted as class 0 (stayed) that actually belong to class 1 (they left) — a Type I error with respect to the positive class.
True Negatives (TN): 776 — instances correctly predicted as class 1 (the model said "left" and the employee did leave).
False Negatives (FN): 1961 — instances predicted as class 1 (left) that actually belong to class 0 (they stayed) — a Type II error with respect to the positive class.
Accuracy: 0.6715
Overall proportion of correct predictions made by the model.
Precision (Positive Predictive Value): 0.9237 Proportion of instances predicted as positive by the model that are actually positive.
Recall (Sensitivity): 0.6647 Proportion of actual positive instances that were correctly predicted by the model.
Specificity: 0.7074 Proportion of actual negative instances that were correctly predicted by the model.
F1 Score: Harmonic mean of precision and recall, providing a balance between the two metrics.
Kappa: 0.2315 A statistic that measures inter-rater agreement for categorical items.
Detection Rate: 0.5597 Rate of correct predictions among all actual positive instances.
Balanced Accuracy: 0.6861 Average of sensitivity and specificity, useful for imbalanced datasets.
# Build the ROC object from the true labels and predicted probabilities,
# then compute the area under the curve (0.5 = chance, 1.0 = perfect ranking).
roc_result <- roc(response = testing_set$Attrition_numeric, predictor = as.numeric(testing_set_predictions))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_result)
# Print AUC
print(auc_value)
## Area under the curve: 0.7416
# roc_result and auc_value were already computed just above, so the duplicate
# roc()/auc() recomputation has been removed.
# Build a plotting data frame from the ROC object. Note that
# roc_result$specificities stores SPECIFICITY; the false positive rate is
# derived as 1 - Specificity inside aes() below.
roc_data <- data.frame(
  TPR = roc_result$sensitivities,         # True Positive Rate (Sensitivity)
  Specificity = roc_result$specificities, # x axis uses 1 - Specificity
  thresholds = roc_result$thresholds
)
# Plot ROC curve using ggplot2
ggroc_plot <- ggplot(data = roc_data, aes(x = 1 - Specificity, y = TPR)) +
  geom_line(color = "blue", linewidth = 1.5) + # linewidth replaces the size aesthetic deprecated in ggplot2 3.4
  geom_ribbon(aes(ymin = 0, ymax = TPR), fill = "blue", alpha = 0.2) + # shaded area under curve
  geom_abline(linetype = "dashed", color = "red") + # chance-level reference line
  ggtitle("ROC Curve with AUC") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), # Centers the title
        axis.title = element_text(size = 12), # Adjust the axis titles size
        axis.text = element_text(size = 12)) + # Adjust the axis text size
  xlab("False Positive Rate (1 - Specificity)") +
  ylab("True Positive Rate (Sensitivity)") +
  annotate("text", x = .5, y = .2, label = paste("AUC =", round(auc_value, 2)),
           size = 5, color = "black", hjust = 0.5)
# Print the plot
print(ggroc_plot)
# Keep only the numeric columns of the (balanced) training set
numeric_data <- training_set[, vapply(training_set, is.numeric, logical(1))]
# Pairwise correlation matrix of the numeric predictors
cor_matrix <- cor(numeric_data)
# Ask caret which columns to drop to remove correlations above the 0.7 cutoff
highly_correlated_pairs <- findCorrelation(cor_matrix, cutoff = 0.7, verbose = TRUE)
## Compare row 20 and column 23 with corr 0.77
## Means: 0.208 vs 0.071 so flagging column 20
## Compare row 17 and column 8 with corr 0.755
## Means: 0.175 vs 0.059 so flagging column 17
## Compare row 8 and column 10 with corr 0.942
## Means: 0.136 vs 0.048 so flagging column 8
## Compare row 23 and column 21 with corr 0.721
## Means: 0.116 vs 0.04 so flagging column 23
## All correlations <= 0.7
# Column indices that caret recommends removing
print(highly_correlated_pairs)
## [1] 20 17 8 23
# Visualize the correlation matrix as a clustered upper-triangle circle plot
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45, tl.cex = 0.6, cl.cex = 0.6)
# Build the design matrix (dummy-coded predictors) and the response vector.
# The "- 1" drops the intercept column; glmnet supplies its own intercept.
x <- model.matrix(Attrition_numeric ~ . - 1, data = training_set) # -1 to exclude intercept
y <- training_set$Attrition_numeric
set.seed(123) # For reproducible cross-validation folds
# Cross-validated LASSO logistic regression (alpha = 1 selects the L1 penalty)
cv_model <- cv.glmnet(x, y, alpha = 1, family = "binomial") # alpha = 1 for LASSO
best_lambda <- cv_model$lambda.min
# Refit at the lambda that minimized cross-validated deviance
final_model <- glmnet(x, y, alpha = 1, lambda = best_lambda, family = "binomial")
# Print the sparse coefficient vector; '.' entries were shrunk exactly to zero
print(coef(final_model))
## 51 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -1.666317338
## DepartmentHuman Resources .
## DepartmentResearch & Development .
## DepartmentSales 0.166045312
## EducationFieldHuman Resources -0.144733398
## EducationFieldLife Sciences 0.006040185
## EducationFieldMarketing .
## EducationFieldMedical .
## EducationFieldOther -0.070960331
## EducationFieldTechnical Degree 0.434409154
## JobRoleHuman Resources Director 0.575760029
## JobRoleHuman Resources Executive .
## JobRoleHuman Resources Manager -3.278457019
## JobRoleHuman Resources Representative 0.027611163
## JobRoleLaboratory Technician 0.037992639
## JobRoleManufacturing Director -0.078563562
## JobRoleResearch & Development Manager -0.111320037
## JobRoleResearch Director .
## JobRoleResearch Scientist -0.060263435
## JobRoleSales Executive .
## JobRoleSales Manager 0.130773754
## JobRoleSales Representative 0.433661163
## Age -0.380537835
## BusinessTravelTravel_Frequently 1.335675624
## BusinessTravelTravel_Rarely 0.660192684
## DailyRate -0.163884846
## DistanceFromHome 0.191044681
## Education -0.003796077
## EnvironmentSatisfaction -0.187248775
## GenderMale 0.118086116
## HourlyRate 0.107827665
## JobInvolvement -0.183731252
## JobLevel -0.057569006
## JobSatisfaction -0.122963324
## MaritalStatusMarried 0.191036571
## MaritalStatusSingle 0.581002400
## MonthlyIncome -0.123923688
## MonthlyRate -0.043174994
## NumCompaniesWorked 0.022229926
## OverTimeYes 0.864794478
## PercentSalaryHike -0.125541833
## PerformanceRating 0.065587919
## RelationshipSatisfaction -0.051891822
## StockOptionLevel -0.087882185
## TotalWorkingYears 0.106687904
## TrainingTimesLastYear -0.119576273
## WorkLifeBalance -0.080160279
## YearsAtCompany 0.077442271
## YearsInCurrentRole -0.269407729
## YearsSinceLastPromotion 0.188414486
## YearsWithCurrManager -0.071073557
(Intercept): The model intercept or bias term is approximately -1.666, which is the log-odds of the outcome when all predictor variables are zero.
Zeroed Coefficients (.): These are variables whose coefficients LASSO has shrunk exactly to zero, effectively removing them from the model. The dot (.) in the sparse-matrix printout marks a structural zero — no effect remains after the penalty has been applied.
Positive Coefficients: Variables such as DepartmentSales (0.166) and BusinessTravelTravel_Frequently (1.336) have positive coefficients, indicating that higher values of these variables are associated with an increased log-odds of the outcome (possibly higher attrition, if Attrition_numeric = 1 represents attrition). For instance, employees in the sales department or those who travel frequently are more likely to experience the event of interest (which might be attrition in this context).
Negative Coefficients: Variables like JobRoleHuman Resources Manager (-3.278) have negative coefficients, indicating that higher values of these variables are associated with a decreased log-odds of the outcome. This might suggest that being in a human resources manager role is associated with a lower likelihood of attrition.
Large Positive Coefficients: Some coefficients like OverTimeYes (0.865) are substantially positive, which would suggest a strong association with the outcome. Employees who work overtime might have a much higher likelihood of attrition. Significant Predictors After Shrinkage: The LASSO method penalizes the absolute size of the coefficients and can drive some to zero, effectively selecting variables by removing those with negligible effects. The remaining non-zero coefficients are the variables LASSO has selected as having a more significant relationship with the outcome.
# Variables flagged for removal (highly correlated / low-value predictors)
variables_to_remove <- c("TotalWorkingYears", "YearsWithCurrManager", "YearsInCurrentRole",
                         "MonthlyRate", "HourlyRate", "DailyRate", "JobLevel", "JobRole")
# Keep every column not on the removal list; setdiff preserves original order
ab_dataset_reduced <- ab_dataset_reduced[setdiff(names(ab_dataset_reduced), variables_to_remove)]
# Confirm the removal by printing the remaining variable names
print(names(ab_dataset_reduced))
## [1] "Department" "EducationField"
## [3] "Age" "BusinessTravel"
## [5] "DistanceFromHome" "Education"
## [7] "EnvironmentSatisfaction" "Gender"
## [9] "JobInvolvement" "JobSatisfaction"
## [11] "MaritalStatus" "MonthlyIncome"
## [13] "NumCompaniesWorked" "OverTime"
## [15] "PercentSalaryHike" "PerformanceRating"
## [17] "RelationshipSatisfaction" "StockOptionLevel"
## [19] "TrainingTimesLastYear" "WorkLifeBalance"
## [21] "YearsAtCompany" "YearsSinceLastPromotion"
## [23] "Attrition_numeric"
set.seed(123) # for reproducibility
# Stratified 70/30 split of the reduced feature set
split <- createDataPartition(ab_dataset_reduced$Attrition_numeric, p = 0.7, list = FALSE)
# Create the training and testing datasets
trainingdata_reduced_ab <- ab_dataset_reduced[split, ]
testingdata_reduced_ab <- ab_dataset_reduced[-split, ]
# Balance the Attrition_numeric class in the training set by up-sampling
set.seed(123) # for reproducibility
trainingdata_balanced_ab <- upSample(x = trainingdata_reduced_ab[, -which(names(trainingdata_reduced_ab) == "Attrition_numeric")],
                                     y = trainingdata_reduced_ab$Attrition_numeric)
# Check the balance of the updated training set
table(trainingdata_balanced_ab$Class)
##
## 0 1
## 13650 13650
# upSample() names the outcome column 'Class'; rename it back FIRST so the
# model is fit exactly once. (The original code fit the identical model
# twice — once on 'Class', then again after the rename.)
names(trainingdata_balanced_ab)[names(trainingdata_balanced_ab) == "Class"] <- "Attrition_numeric"
# Fit the logistic regression model using 'Attrition_numeric' as the dependent variable
reduced_logistic_model_ab_updated <- glm(Attrition_numeric ~ ., data = trainingdata_balanced_ab, family = binomial())
# Alias kept so any later code referring to the old name still works
reduced_logistic_model_ab <- reduced_logistic_model_ab_updated
# Print out the summary of the updated logistic regression model
summary(reduced_logistic_model_ab_updated)
##
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = trainingdata_balanced_ab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.809264 0.225487 -8.024 1.03e-15 ***
## DepartmentResearch & Development -0.012866 0.051673 -0.249 0.80337
## DepartmentSales 0.391511 0.059179 6.616 3.70e-11 ***
## EducationFieldHuman Resources 0.037908 0.252895 0.150 0.88085
## EducationFieldLife Sciences 0.013449 0.213519 0.063 0.94978
## EducationFieldMarketing 0.006694 0.219828 0.030 0.97571
## EducationFieldMedical 0.082194 0.213063 0.386 0.69966
## EducationFieldOther 0.119205 0.218913 0.545 0.58608
## EducationFieldTechnical Degree 0.432195 0.217422 1.988 0.04683 *
## Age -0.394975 0.013692 -28.847 < 2e-16 ***
## BusinessTravelTravel_Frequently 1.378974 0.056422 24.440 < 2e-16 ***
## BusinessTravelTravel_Rarely 0.729983 0.051303 14.229 < 2e-16 ***
## DistanceFromHome 0.160582 0.014489 11.083 < 2e-16 ***
## Education -0.053962 0.013532 -3.988 6.67e-05 ***
## EnvironmentSatisfaction -0.152135 0.013681 -11.120 < 2e-16 ***
## GenderMale 0.163060 0.027498 5.930 3.03e-09 ***
## JobInvolvement -0.197086 0.013126 -15.015 < 2e-16 ***
## JobSatisfaction -0.159256 0.013305 -11.970 < 2e-16 ***
## MaritalStatusMarried 0.177879 0.037270 4.773 1.82e-06 ***
## MaritalStatusSingle 0.550131 0.046698 11.781 < 2e-16 ***
## MonthlyIncome -0.126016 0.017230 -7.314 2.60e-13 ***
## NumCompaniesWorked 0.063519 0.013849 4.587 4.51e-06 ***
## OverTimeYes 0.912477 0.028540 31.972 < 2e-16 ***
## PercentSalaryHike -0.105969 0.014667 -7.225 5.00e-13 ***
## PerformanceRating 0.010503 0.014600 0.719 0.47190
## RelationshipSatisfaction -0.034949 0.013246 -2.639 0.00833 **
## StockOptionLevel -0.103958 0.017126 -6.070 1.28e-09 ***
## TrainingTimesLastYear -0.139011 0.013765 -10.099 < 2e-16 ***
## WorkLifeBalance -0.093841 0.013063 -7.184 6.79e-13 ***
## YearsAtCompany -0.050584 0.019401 -2.607 0.00913 **
## YearsSinceLastPromotion 0.097107 0.017197 5.647 1.64e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37846 on 27299 degrees of freedom
## Residual deviance: 32963 on 27269 degrees of freedom
## AIC: 33025
##
## Number of Fisher Scoring iterations: 4
# Exponentiate the log-odds coefficients to obtain odds ratios.
# The numeric predictors were z-scored earlier in this script, so each
# ratio corresponds to a one-standard-deviation change in that predictor.
exp(coef(reduced_logistic_model_ab_updated))
## (Intercept) DepartmentResearch & Development
## 0.1637746 0.9872168
## DepartmentSales EducationFieldHuman Resources
## 1.4792136 1.0386355
## EducationFieldLife Sciences EducationFieldMarketing
## 1.0135402 1.0067160
## EducationFieldMedical EducationFieldOther
## 1.0856669 1.1266007
## EducationFieldTechnical Degree Age
## 1.5406359 0.6736967
## BusinessTravelTravel_Frequently BusinessTravelTravel_Rarely
## 3.9708247 2.0750448
## DistanceFromHome Education
## 1.1741939 0.9474680
## EnvironmentSatisfaction GenderMale
## 0.8588725 1.1771070
## JobInvolvement JobSatisfaction
## 0.8211201 0.8527779
## MaritalStatusMarried MaritalStatusSingle
## 1.1946807 1.7334793
## MonthlyIncome NumCompaniesWorked
## 0.8816012 1.0655796
## OverTimeYes PercentSalaryHike
## 2.4904832 0.8994529
## PerformanceRating RelationshipSatisfaction
## 1.0105587 0.9656546
## StockOptionLevel TrainingTimesLastYear
## 0.9012627 0.8702184
## WorkLifeBalance YearsAtCompany
## 0.9104273 0.9506743
## YearsSinceLastPromotion
## 1.1019788
(Intercept): The odds ratio for the intercept is 0.1637746. Since the intercept represents the log odds of the outcome when all predictors are held at zero, this value isn’t often used for interpretation in the context of the predictors.
DepartmentResearch & Development: The odds of the outcome (likely an event such as employee attrition, based on the context of the variable names) for employees in the Research & Development department are 0.9872168 times those of the baseline department (not shown here, but typically the one excluded from the model), holding all other variables constant. This value is very close to 1, indicating almost no change in odds compared to the baseline.
DepartmentSales: Employees in the Sales department have 1.4792136 times the odds of the outcome compared to the baseline department, holding all other variables constant. This suggests a 47.9% increase in the odds of the outcome occurring. Age: For each one-unit increase in age, the odds of the outcome decrease by a factor of 0.6736967 (or 32.6% decrease in the odds), holding other variables constant.
BusinessTravelTravel_Frequently: Employees who travel frequently have 3.9708247 times the odds of the outcome compared to those who do not travel as part of their job (likely the reference category), all else being equal. This is a substantial increase in odds.
MonthlyIncome: Because the numeric predictors were standardized (z-scored) earlier in this script, a one-unit increase here means an increase of one standard deviation of monthly income — not a dollar amount. Each such increase multiplies the odds of the outcome by 0.8816012, indicating that the odds of the outcome decrease as monthly income increases.
OverTimeYes: Employees who work overtime have 2.4904832 times the odds of the outcome happening compared to those who do not work overtime, controlling for other factors.
MaritalStatusSingle: Employees who are single have 1.7334793 times the odds of the outcome compared to the baseline marital status (likely married or divorced, which is not listed).
# Score the held-out testing set: predicted attrition probabilities from the
# reduced-feature logistic regression model
testingdata_predictions_ab <- predict(reduced_logistic_model_ab_updated, newdata = testingdata_reduced_ab, type = "response")
# Convert predictions to a binary outcome based on a threshold (e.g., 0.5)
predicted_class_ab <- ifelse(testingdata_predictions_ab > 0.5, 1, 0)
# Actual values of Attrition_numeric from the testing set
actual_class_ab <- testingdata_reduced_ab$Attrition_numeric
# Confusion matrix of predicted vs. actual labels.
# Note: caret will again treat level "0" (stayed) as the positive class.
confusionMatrix(data = as.factor(predicted_class_ab), reference = as.factor(actual_class_ab))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3886 347
## 1 1963 750
##
## Accuracy : 0.6674
## 95% CI : (0.6562, 0.6785)
## No Information Rate : 0.8421
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2178
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6644
## Specificity : 0.6837
## Pos Pred Value : 0.9180
## Neg Pred Value : 0.2764
## Prevalence : 0.8421
## Detection Rate : 0.5595
## Detection Prevalence : 0.6094
## Balanced Accuracy : 0.6740
##
## 'Positive' Class : 0
##
# Build the ROC object from the held-out labels and predicted probabilities
roc_object <- roc(response = actual_class_ab, predictor = testingdata_predictions_ab)
# Assemble a plotting data frame. pROC stores *specificities*, so the false
# positive rate must be derived as 1 - specificity. The original stored raw
# specificity under the name False_Positive_Rate and compensated with a
# `1 - x` transform at plot time; storing the true FPR avoids that trap.
roc_data <- data.frame(
  True_Positive_Rate = roc_object$sensitivities,
  False_Positive_Rate = 1 - roc_object$specificities,
  Threshold = roc_object$thresholds
)
# Area under the ROC curve (0.5 = chance, 1 = perfect discrimination)
auc_value <- auc(roc_object)
# ROC curve with ggplot2 (linewidth replaces the deprecated size aesthetic
# for lines as of ggplot2 3.4; this file loads ggplot2 3.5.1)
roc_curve_plot <- ggplot(roc_data, aes(x = False_Positive_Rate, y = True_Positive_Rate)) +
  geom_line(color = "#1c61b6", linewidth = 1) +
  geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), color = "red", linetype = "dotted") +
  geom_text(aes(x = 0.8, y = 0.2, label = paste("AUC =", round(auc_value, 3))), hjust = 0, vjust = 0, color = "red") +
  labs(title = "ROC Curve with AUC", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = "white"),
        legend.position = "none")
# Render the plot
print(roc_curve_plot)
# Partition the reduced dataset into one data frame per Department so each
# department can be modelled separately below
rd_subset <- subset(ab_dataset_reduced, Department == "Research & Development")
sales_subset <- subset(ab_dataset_reduced, Department == "Sales")
hr_subset <- subset(ab_dataset_reduced, Department == "Human Resources")
# Count the number of declared levels of every factor column in a data frame.
#
# The previous implementation subset with `data[, sapply(data, is.factor)]`,
# which collapses to a bare factor vector when exactly one column matches
# (data.frame `[` defaults to drop = TRUE) and then iterates its *elements*
# instead of its columns. Filter() keeps the list-of-columns structure and
# the column names regardless of how many factor columns are present.
#
# @param data A data frame; non-factor columns are ignored.
# @return A named list mapping each factor column name to its level count.
check_factor_levels <- function(data) {
  lapply(Filter(is.factor, data), function(x) length(levels(x)))
}
# Tabulate the factor-level counts for each department subset
rd_levels <- check_factor_levels(rd_subset)
sales_levels <- check_factor_levels(sales_subset)
hr_levels <- check_factor_levels(hr_subset)
# --- Sales department model -------------------------------------------------
# Subset data for the Sales department from main dataset
sales_subset <- subset(ab_dataset_reduced, Department == "Sales")
# Remove the 'Department' column since it's no longer needed after filtering for Sales
sales_subset$Department <- NULL
# Remove single-level factors which can cause issues in modeling
# (glm cannot build contrasts for a factor with a single level; note this
# tests *declared* levels, which subsetting does not drop)
sales_subset <- sales_subset[, sapply(sales_subset, function(x) !is.factor(x) || length(levels(x)) > 1)]
# Split data into training and testing sets (75/25 random split)
set.seed(123) # For reproducibility
index <- sample(1:nrow(sales_subset), round(0.75 * nrow(sales_subset)))
train_data <- sales_subset[index, ]
test_data_sales <- sales_subset[-index, ]
# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)
# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]
# Sampling from the majority class to balance with the minority class
# NOTE(review): assumes class 0 is the majority class — confirm this holds
train_data_0_downsampled <- train_data_0[sample(1:nrow(train_data_0), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)
# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]
# Fit logistic regression model on the balanced training data; tryCatch keeps
# the script alive if the fit fails (e.g. rank deficiency after subsetting)
tryCatch({
sales_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
print(summary(sales_model))
}, error = function(e) {
print("Error fitting model:")
print(e$message)
})
##
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.910016 0.278052 -6.869 6.45e-12 ***
## EducationFieldMarketing 0.137648 0.126109 1.091 0.275055
## EducationFieldOther 0.286175 0.269705 1.061 0.288659
## Age -0.202272 0.059618 -3.393 0.000692 ***
## BusinessTravelTravel_Frequently 1.623971 0.245235 6.622 3.54e-11 ***
## BusinessTravelTravel_Rarely 0.790824 0.224385 3.524 0.000424 ***
## DistanceFromHome 0.378346 0.058956 6.417 1.39e-10 ***
## Education -0.187577 0.056760 -3.305 0.000951 ***
## EnvironmentSatisfaction -0.193451 0.059834 -3.233 0.001224 **
## GenderMale 0.096305 0.115787 0.832 0.405551
## JobInvolvement -0.292967 0.055331 -5.295 1.19e-07 ***
## JobSatisfaction -0.169771 0.055551 -3.056 0.002242 **
## MaritalStatusMarried -0.178187 0.161806 -1.101 0.270795
## MaritalStatusSingle 0.775332 0.202421 3.830 0.000128 ***
## MonthlyIncome -0.114129 0.083884 -1.361 0.173651
## NumCompaniesWorked 0.091511 0.050069 1.828 0.067596 .
## OverTimeYes 1.125056 0.123449 9.114 < 2e-16 ***
## PercentSalaryHike -0.280459 0.064031 -4.380 1.19e-05 ***
## PerformanceRating 0.002623 0.066168 0.040 0.968381
## RelationshipSatisfaction -0.109068 0.054675 -1.995 0.046060 *
## StockOptionLevel 0.136253 0.075686 1.800 0.071824 .
## TrainingTimesLastYear -0.021633 0.056995 -0.380 0.704272
## WorkLifeBalance -0.127593 0.058292 -2.189 0.028608 *
## YearsAtCompany -0.205858 0.084336 -2.441 0.014649 *
## YearsSinceLastPromotion 0.370418 0.070870 5.227 1.73e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2298.5 on 1657 degrees of freedom
## Residual deviance: 1920.6 on 1633 degrees of freedom
## AIC: 1970.6
##
## Number of Fisher Scoring iterations: 4
# Evaluate the Sales model on its held-out split at a 0.5 probability cut-off
pred_prob <- predict(sales_model, test_data_sales, type = "response")
pred_class <- as.numeric(pred_prob > 0.5)
# Base-R confusion matrix: rows = predicted class, columns = actual class
confusion_matrix <- table(Predicted = pred_class, Actual = test_data_sales$Attrition_numeric)
print(confusion_matrix)
# Overall accuracy = correctly classified / total observations
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
# Sanity-check the test split for NAs or out-of-range values
summary(test_data_sales)
## EducationField Age BusinessTravel
## 0 : 0 Min. :-2.07560 Non-Travel :149
## Human Resources : 0 1st Qu.:-0.75885 Travel_Frequently:275
## Life Sciences :894 Median :-0.21021 Travel_Rarely :964
## Marketing :415 Mean :-0.04735
## Medical : 0 3rd Qu.: 0.55789
## Other : 79 Max. : 2.53301
## Technical Degree: 0
## DistanceFromHome Education EnvironmentSatisfaction Gender
## Min. :-1.01127 Min. :-2.11567 Min. :-1.57109 Female:587
## 1st Qu.:-0.88779 1st Qu.:-1.08579 1st Qu.:-0.65761 Male :801
## Median :-0.14691 Median :-0.05591 Median : 0.25586
## Mean : 0.05352 Mean :-0.07149 Mean : 0.02815
## 3rd Qu.: 0.59397 3rd Qu.: 0.97398 3rd Qu.: 1.16934
## Max. : 2.44617 Max. : 2.00386 Max. : 1.16934
##
## JobInvolvement JobSatisfaction MaritalStatus MonthlyIncome
## Min. :-2.43834 Min. :-1.56854 Divorced:282 Min. :-1.15914
## 1st Qu.:-1.02971 1st Qu.:-0.66049 Married :635 1st Qu.:-0.49243
## Median : 0.37892 Median : 0.24756 Single :471 Median :-0.18585
## Mean :-0.01484 Mean : 0.02055 Mean : 0.04991
## 3rd Qu.: 0.37892 3rd Qu.: 1.15561 3rd Qu.: 0.45068
## Max. : 1.78756 Max. : 1.15561 Max. : 2.83429
##
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## Min. :-1.0760 No :977 Min. :-1.15182 Min. :-0.42862
## 1st Qu.:-0.6758 Yes:411 1st Qu.:-0.87854 1st Qu.:-0.42862
## Median : 0.5249 Median :-0.33197 Median :-0.42862
## Mean : 0.4534 Mean :-0.03408 Mean :-0.03468
## 3rd Qu.: 1.3253 3rd Qu.: 0.76116 3rd Qu.:-0.42862
## Max. : 2.5260 Max. : 2.67414 Max. : 2.33296
##
## RelationshipSatisfaction StockOptionLevel TrainingTimesLastYear
## Min. :-1.58641 Min. :-0.933639 Min. :-2.16986
## 1st Qu.:-0.66086 1st Qu.:-0.933639 1st Qu.:-0.61951
## Median : 0.26468 Median : 0.241517 Median : 0.15566
## Mean :-0.01872 Mean :-0.004013 Mean : 0.03782
## 3rd Qu.: 1.19023 3rd Qu.: 0.241517 3rd Qu.: 0.15566
## Max. : 1.19023 Max. : 2.591829 Max. : 2.48119
##
## WorkLifeBalance YearsAtCompany YearsSinceLastPromotion
## Min. :-2.49813 Min. :-1.14335 Min. :-0.67867
## 1st Qu.:-1.08035 1st Qu.:-0.65445 1st Qu.:-0.67867
## Median : 0.33742 Median :-0.16555 Median :-0.36759
## Mean : 0.08819 Mean : 0.05835 Mean : 0.03246
## 3rd Qu.: 0.33742 3rd Qu.: 0.48632 3rd Qu.: 0.25456
## Max. : 1.75520 Max. : 4.88641 Max. : 3.98749
##
## Attrition_numeric
## 0:1097
## 1: 291
##
##
##
##
##
# Repeat the evaluation with caret::confusionMatrix, which also reports
# sensitivity, specificity, kappa, and related statistics
predictions <- predict(sales_model, test_data_sales, type = "response")
predicted_classes <- as.numeric(predictions > 0.5)
conf_matrix <- confusionMatrix(factor(predicted_classes), factor(test_data_sales$Attrition_numeric))
# Print the detailed confusion matrix and associated statistics
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 791 106
## 1 306 185
##
## Accuracy : 0.7032
## 95% CI : (0.6784, 0.7271)
## No Information Rate : 0.7903
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2849
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7211
## Specificity : 0.6357
## Pos Pred Value : 0.8818
## Neg Pred Value : 0.3768
## Prevalence : 0.7903
## Detection Rate : 0.5699
## Detection Prevalence : 0.6463
## Balanced Accuracy : 0.6784
##
## 'Positive' Class : 0
##
## [1] "Number of rows in test dataset: 1388"
## [1] "Length of prediction vector: 1388"
## [1] "AUC: 0.744351198363548"
# --- Research & Development department model --------------------------------
# Subset data for the Research & Development department from the correct dataset
rd_subset <- subset(ab_dataset_reduced, Department == "Research & Development")
# Remove the 'Department' column as it is not needed after the subset by department
rd_subset$Department <- NULL
# Remove single-level factors which can cause issues in modeling
# (glm cannot build contrasts for a factor with a single level)
rd_subset <- rd_subset[, sapply(rd_subset, function(x) !is.factor(x) || length(levels(x)) > 1)]
# Split data into training and testing sets (75/25 random split)
set.seed(123) # Set seed for reproducibility
index <- sample(1:nrow(rd_subset), round(0.75 * nrow(rd_subset)))
train_data <- rd_subset[index, ]
test_data_rd <- rd_subset[-index, ]
# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)
# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]
# Sampling from the majority class to balance with the minority class
# NOTE(review): assumes class 0 is the majority class — confirm this holds
train_data_0_downsampled <- train_data_0[sample(1:nrow(train_data_0), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)
# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]
# Fit logistic regression model on the balanced training data; tryCatch keeps
# the script alive if the fit fails
tryCatch({
rd_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
print(summary(rd_model))
}, error = function(e) {
print("Error fitting model:")
print(e$message)
})
##
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.709372 0.554885 -4.883 1.05e-06 ***
## EducationFieldLife Sciences 0.900856 0.535904 1.681 0.092762 .
## EducationFieldMedical 0.909116 0.533733 1.703 0.088509 .
## EducationFieldOther 0.651873 0.558317 1.168 0.242981
## EducationFieldTechnical Degree 1.265953 0.543632 2.329 0.019875 *
## Age -0.438291 0.038596 -11.356 < 2e-16 ***
## BusinessTravelTravel_Frequently 1.237428 0.168963 7.324 2.41e-13 ***
## BusinessTravelTravel_Rarely 0.711752 0.154845 4.597 4.30e-06 ***
## DistanceFromHome 0.084496 0.043180 1.957 0.050369 .
## Education -0.030262 0.039970 -0.757 0.448972
## EnvironmentSatisfaction -0.072961 0.039429 -1.850 0.064250 .
## GenderMale 0.290066 0.079730 3.638 0.000275 ***
## JobInvolvement -0.206738 0.038491 -5.371 7.83e-08 ***
## JobSatisfaction -0.218284 0.039038 -5.592 2.25e-08 ***
## MaritalStatusMarried 0.234966 0.110855 2.120 0.034041 *
## MaritalStatusSingle 0.755564 0.139087 5.432 5.56e-08 ***
## MonthlyIncome -0.130092 0.046426 -2.802 0.005077 **
## NumCompaniesWorked -0.003873 0.044468 -0.087 0.930600
## OverTimeYes 0.870294 0.082148 10.594 < 2e-16 ***
## PercentSalaryHike 0.004104 0.041492 0.099 0.921218
## PerformanceRating 0.007081 0.041934 0.169 0.865909
## RelationshipSatisfaction -0.041873 0.038481 -1.088 0.276526
## StockOptionLevel -0.101840 0.050779 -2.006 0.044904 *
## TrainingTimesLastYear -0.177414 0.039166 -4.530 5.91e-06 ***
## WorkLifeBalance -0.078901 0.038435 -2.053 0.040087 *
## YearsAtCompany -0.044004 0.055136 -0.798 0.424813
## YearsSinceLastPromotion -0.020841 0.051443 -0.405 0.685389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4585.9 on 3307 degrees of freedom
## Residual deviance: 3972.3 on 3281 degrees of freedom
## AIC: 4026.3
##
## Number of Fisher Scoring iterations: 3
# Evaluate the R&D model on its held-out split at a 0.5 probability cut-off
pred_prob <- predict(rd_model, test_data_rd, type = "response")
pred_class <- as.numeric(pred_prob > 0.5)
# Base-R confusion matrix: rows = predicted class, columns = actual class
confusion_matrix <- table(Predicted = pred_class, Actual = test_data_rd$Attrition_numeric)
print(confusion_matrix)
# Overall accuracy = correctly classified / total observations
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.665801090625811"
# Repeat the evaluation with caret::confusionMatrix for the full statistics
predictions_rd <- predict(rd_model, test_data_rd, type = "response")
predicted_classes_rd <- as.numeric(predictions_rd > 0.5)
conf_matrix_rd <- confusionMatrix(factor(predicted_classes_rd), factor(test_data_rd$Attrition_numeric))
# Print the detailed confusion matrix and associated statistics
print(conf_matrix_rd)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2182 187
## 1 1100 382
##
## Accuracy : 0.6658
## 95% CI : (0.6507, 0.6807)
## No Information Rate : 0.8522
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2021
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6648
## Specificity : 0.6714
## Pos Pred Value : 0.9211
## Neg Pred Value : 0.2578
## Prevalence : 0.8522
## Detection Rate : 0.5666
## Detection Prevalence : 0.6152
## Balanced Accuracy : 0.6681
##
## 'Positive' Class : 0
##
# ROC analysis for the R&D model; prediction is guarded so a failure does not
# abort the script, and pred_prob is reset first so a stale value from an
# earlier chunk cannot leak through the guard below
pred_prob <- NULL
tryCatch({
  pred_prob <- predict(rd_model, newdata = test_data_rd, type = "response")
}, error = function(e) {
  print("Error in prediction:")
  print(e$message)
})
# Proceed only when predictions exist and align 1:1 with the test labels
if (!is.null(pred_prob) && length(pred_prob) == length(test_data_rd$Attrition_numeric)) {
  library(pROC)
  # Build and draw the ROC curve
  roc_curve <- roc(response = test_data_rd$Attrition_numeric, predictor = pred_prob)
  plot(roc_curve, main = "ROC Curve for R&D Logistic Model")
  abline(a = 0, b = 1, col = "red", lty = 2) # chance-level reference line
  # Area under the curve (0.5 = chance, 1 = perfect)
  auc_value <- auc(roc_curve)
  print(paste("AUC:", auc_value))
  # Annotate the plot with the AUC value
  legend("bottomright", legend = paste("AUC =", round(auc_value, 4)), box.lty = 1, col = "black", bty = "n")
} else {
  print("Error: Unable to generate ROC curve due to previous error or mismatch in data lengths.")
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [1] "AUC: 0.725022463691284"
# --- Human Resources department model ---------------------------------------
# Subset data for the Human Resources department from the updated dataset name
hr_subset <- subset(ab_dataset_reduced, Department == "Human Resources")
# Remove the 'Department' column as it is no longer needed after the subset by department
hr_subset$Department <- NULL
# Remove single-level factors which can cause issues in modeling
# (glm cannot build contrasts for a factor with a single level)
hr_subset <- hr_subset[, sapply(hr_subset, function(x) !is.factor(x) || length(levels(x)) > 1)]
# Split data into training and testing sets (75/25 random split)
set.seed(123) # Set seed for reproducibility
index <- sample(1:nrow(hr_subset), round(0.75 * nrow(hr_subset)))
train_data <- hr_subset[index, ]
test_data_hr <- hr_subset[-index, ]
# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)
# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]
# Sampling from the majority class to balance with the minority class
# NOTE(review): assumes class 0 is the majority class — confirm this holds
train_data_0_downsampled <- train_data_0[sample(1:nrow(train_data_0), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)
# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]
# Fit logistic regression model on the balanced training data; tryCatch keeps
# the script alive if the fit fails
tryCatch({
hr_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
print(summary(hr_model))
}, error = function(e) {
print("Error fitting model:")
print(e$message)
})
##
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.66261 0.75415 -0.879 0.379609
## EducationFieldLife Sciences -0.73767 0.40108 -1.839 0.065880 .
## EducationFieldMarketing -0.72497 0.66125 -1.096 0.272916
## EducationFieldMedical 0.34875 0.46547 0.749 0.453713
## EducationFieldOther 0.44066 0.54978 0.802 0.422822
## EducationFieldTechnical Degree -0.11687 0.66201 -0.177 0.859876
## Age -0.78064 0.14690 -5.314 1.07e-07 ***
## BusinessTravelTravel_Frequently 1.34156 0.64306 2.086 0.036960 *
## BusinessTravelTravel_Rarely 1.01639 0.60731 1.674 0.094210 .
## DistanceFromHome 0.30631 0.14073 2.177 0.029508 *
## Education 0.47500 0.13697 3.468 0.000525 ***
## EnvironmentSatisfaction -0.26257 0.11520 -2.279 0.022659 *
## GenderMale -0.25698 0.26559 -0.968 0.333250
## JobInvolvement -0.16369 0.13484 -1.214 0.224775
## JobSatisfaction 0.01171 0.12541 0.093 0.925614
## MaritalStatusMarried -0.48239 0.33553 -1.438 0.150522
## MaritalStatusSingle -0.52188 0.45337 -1.151 0.249687
## MonthlyIncome -0.50460 0.17893 -2.820 0.004800 **
## NumCompaniesWorked -0.20397 0.12112 -1.684 0.092183 .
## OverTimeYes 0.34287 0.26691 1.285 0.198930
## PercentSalaryHike -0.43075 0.14070 -3.062 0.002202 **
## PerformanceRating 0.24800 0.14072 1.762 0.078005 .
## RelationshipSatisfaction -0.20449 0.13423 -1.523 0.127656
## StockOptionLevel -0.25464 0.15249 -1.670 0.094933 .
## TrainingTimesLastYear -0.60331 0.15039 -4.012 6.03e-05 ***
## WorkLifeBalance 0.07025 0.13359 0.526 0.598949
## YearsAtCompany 0.31387 0.16130 1.946 0.051673 .
## YearsSinceLastPromotion 0.25176 0.16435 1.532 0.125557
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 634.92 on 457 degrees of freedom
## Residual deviance: 461.35 on 430 degrees of freedom
## AIC: 517.35
##
## Number of Fisher Scoring iterations: 5
# Evaluate the HR model on its held-out split at a 0.5 probability cut-off
pred_prob <- predict(hr_model, test_data_hr, type = "response")
pred_class <- as.numeric(pred_prob > 0.5)
# Base-R confusion matrix: rows = predicted class, columns = actual class
confusion_matrix <- table(Predicted = pred_class, Actual = test_data_hr$Attrition_numeric)
print(confusion_matrix)
# Overall accuracy = correctly classified / total observations
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.699453551912568"
# Repeat the evaluation with caret::confusionMatrix for the full statistics
predictions_hr <- predict(hr_model, test_data_hr, type = "response")
predicted_classes_hr <- as.numeric(predictions_hr > 0.5)
conf_matrix_hr <- confusionMatrix(factor(predicted_classes_hr), factor(test_data_hr$Attrition_numeric))
# Print the detailed confusion matrix and associated statistics
print(conf_matrix_hr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 325 26
## 1 139 59
##
## Accuracy : 0.6995
## 95% CI : (0.6592, 0.7376)
## No Information Rate : 0.8452
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2557
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7004
## Specificity : 0.6941
## Pos Pred Value : 0.9259
## Neg Pred Value : 0.2980
## Prevalence : 0.8452
## Detection Rate : 0.5920
## Detection Prevalence : 0.6393
## Balanced Accuracy : 0.6973
##
## 'Positive' Class : 0
##
# Refit the HR logistic model (defensively wrapped; same call as the fit above)
tryCatch({
  hr_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
}, error = function(e) {
  print("Error in model fitting:")
  print(e$message)
})
# Reset pred_prob before predicting. Without this, a failure inside tryCatch
# would leave the stale pred_prob from the earlier evaluation chunk in scope,
# and the length check below would silently pass on outdated predictions.
pred_prob <- NULL
# Make predictions on the test dataset
tryCatch({
  pred_prob <- predict(hr_model, newdata = test_data_hr, type = "response")
}, error = function(e) {
  print("Error in prediction:")
  print(e$message)
})
# Re-check the lengths
print(paste("Re-checked length of actual test responses:", length(test_data_hr$Attrition_numeric)))
print(paste("Re-checked length of predicted probabilities:", length(pred_prob)))
# Only proceed if predictions exist and lengths match
if (!is.null(pred_prob) && length(test_data_hr$Attrition_numeric) == length(pred_prob)) {
  library(pROC)
  # Generate ROC curve
  roc_curve <- roc(response = test_data_hr$Attrition_numeric, predictor = pred_prob)
  # Plot ROC curve
  plot(roc_curve, main = "ROC Curve for HR Department Logistic Regression Model")
  abline(a = 0, b = 1, col = "red") # Diagonal reference line
  # Calculate AUC
  auc_value <- auc(roc_curve)
  print(paste("AUC:", auc_value))
  # Annotate the plot with the AUC value
  legend("bottomright", legend = paste("AUC =", round(auc_value, 4)), box.lty = 1, col = "black", bty = "n")
} else {
  print("Length mismatch between test responses and predictions. Unable to generate ROC curve.")
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [1] "AUC: 0.784888438133874"
# Summarize a fitted binary-classification model's hold-out performance.
#
# Fixes a latent bug: the original built the confusion matrix from raw
# predictions, so when only one class was ever predicted `table()` returned a
# 1x2 matrix and `sum(diag(...))` computed the wrong accuracy. Forcing both
# predicted and actual values to 2-level factors guarantees a 2x2 table.
#
# @param department Label printed in the header (character).
# @param model A fitted model supporting predict(..., type = "response").
# @param test_data Data frame with an Attrition_numeric column (0/1 outcome).
# @param threshold Probability cut-off for the positive class (default 0.5).
# @return The accuracy, invisibly (previously NULL; backward compatible).
print_model_summary <- function(department, model, test_data, threshold = 0.5) {
  cat("\n----------------------------------------\n")
  cat("Department:", department, "\n")
  pred_prob <- predict(model, test_data, type = "response")
  # Fixed 0/1 levels keep the table 2x2 even if a class is never predicted
  pred_class <- factor(ifelse(pred_prob > threshold, 1, 0), levels = c(0, 1))
  actual_class <- factor(test_data$Attrition_numeric, levels = c(0, 1))
  confusion_matrix <- table(Predicted = pred_class, Actual = actual_class)
  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
  cat("Confusion Matrix:\n")
  print(confusion_matrix)
  cat("Accuracy:", sprintf("%.2f%%", accuracy * 100), "\n")
  invisible(accuracy)
}
# Print summaries
# Per-department hold-out performance, starting with Sales
print_model_summary("Sales", sales_model, test_data_sales)
##
## ----------------------------------------
## Department: Sales
## Confusion Matrix:
## Actual
## Predicted 0 1
## 0 791 106
## 1 306 185
## Accuracy: 70.32%
# Hold-out performance summary for the Research & Development model
print_model_summary("Research & Development", rd_model, test_data_rd)
##
## ----------------------------------------
## Department: Research & Development
## Confusion Matrix:
## Actual
## Predicted 0 1
## 0 2182 187
## 1 1100 382
## Accuracy: 66.58%
# Hold-out performance summary for the Human Resources model
print_model_summary("Human Resources", hr_model, test_data_hr)
##
## ----------------------------------------
## Department: Human Resources
## Confusion Matrix:
## Actual
## Predicted 0 1
## 0 325 26
## 1 139 59
## Accuracy: 69.95%
# Propensity-score matching block (see review notes below)
data <- ab_dataset_reduced
# Calculate propensity scores
ps_model <- glm(Attrition_numeric ~ ., data = trainingdata_balanced_ab, family = "binomial")
# Perform nearest-neighbour matching on the propensity score.
# NOTE(review): matchit() has no `model` argument — it is swallowed by `...`
# and matchit refits its own propensity model from the formula; confirm this
# is intended, or supply `distance = ps_model$fitted.values` instead.
match_data <- matchit(Attrition_numeric ~ ., data = trainingdata_balanced_ab, method = "nearest", model = ps_model)
# Extract matched data
matched_data <- match.data(match_data)
set.seed(123) # For reproducibility
# 70/30 split of the matched data
split <- sample.split(matched_data$Attrition_numeric, SplitRatio = 0.7)
train_data <- subset(matched_data, split == TRUE)
test_data <- subset(matched_data, split == FALSE)
# Fit logistic regression on the balanced training data.
# NOTE(review): this model is fit on trainingdata_balanced_ab, not on the
# matched train_data built just above, so the matching step is effectively
# unused here — verify which dataset was intended.
reduced_model <- glm(Attrition_numeric ~ .,
data = trainingdata_balanced_ab, family = binomial())
# Calculate VIF to check for multicollinearity
vif(reduced_model) # A VIF value greater than 5 is typically cause for concern
## GVIF Df GVIF^(1/(2*Df))
## Department 2.132924 2 1.208492
## EducationField 2.535214 6 1.080607
## Age 1.082284 1 1.040329
## BusinessTravel 1.044822 2 1.011022
## DistanceFromHome 1.230991 1 1.109500
## Education 1.035204 1 1.017450
## EnvironmentSatisfaction 1.085291 1 1.041773
## Gender 1.026220 1 1.013025
## JobInvolvement 1.014915 1 1.007430
## JobSatisfaction 1.021082 1 1.010486
## MaritalStatus 1.734611 2 1.147626
## MonthlyIncome 1.474362 1 1.214233
## NumCompaniesWorked 1.139318 1 1.067388
## OverTime 1.034080 1 1.016897
## PercentSalaryHike 1.212110 1 1.100959
## PerformanceRating 1.191801 1 1.091696
## RelationshipSatisfaction 1.017034 1 1.008481
## StockOptionLevel 1.693762 1 1.301446
## TrainingTimesLastYear 1.027627 1 1.013719
## WorkLifeBalance 1.016379 1 1.008156
## YearsAtCompany 2.043510 1 1.429514
## YearsSinceLastPromotion 1.657952 1 1.287615
# Score the held-out set with the reduced model and evaluate at a 0.5 cut-off
predicted_probabilities <- predict(reduced_model, newdata = testingdata_reduced_ab, type = "response")
predicted_classes <- as.numeric(predicted_probabilities > 0.5)
# Detailed confusion matrix and statistics via caret
confusionMatrix(as.factor(predicted_classes), as.factor(testingdata_reduced_ab$Attrition_numeric))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3886 347
## 1 1963 750
##
## Accuracy : 0.6674
## 95% CI : (0.6562, 0.6785)
## No Information Rate : 0.8421
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2178
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6644
## Specificity : 0.6837
## Pos Pred Value : 0.9180
## Neg Pred Value : 0.2764
## Prevalence : 0.8421
## Detection Rate : 0.5595
## Detection Prevalence : 0.6094
## Balanced Accuracy : 0.6740
##
## 'Positive' Class : 0
##
# Set seed for reproducibility
set.seed(123)
# Splitting the dataset into training and testing sets (70:30)
split <- createDataPartition(training_set$Attrition_numeric, p = 0.7, list = FALSE, times = 1)
trainingset_interaction <- training_set[split, ]
testset_interaction <- training_set[-split, ]
# Balancing the 'Attrition_numeric' class in the training set using upSample
# NOTE(review): upSample expects a factor y — confirm Attrition_numeric is a
# factor at this point in the script
balanceddata_interaction <- upSample(x = trainingset_interaction[, names(trainingset_interaction) != "Attrition_numeric"],
y = trainingset_interaction$Attrition_numeric)
# upSample stores the outcome in a column named 'Class'; copy it back under
# the original name and drop the duplicate column
balanceddata_interaction$Attrition_numeric <- as.factor(balanceddata_interaction$Class)
balanceddata_interaction$Class <- NULL # Clean up, just in case
# Fit logistic regression model on the balanced training set. The three
# product terms (MonthlyIncome*JobLevel, TotalWorkingYears*JobLevel,
# YearsInCurrentRole*YearsAtCompany) add both main effects and interactions.
logistic_model_interaction <- glm(Attrition_numeric ~ Department + EducationField + Age +
BusinessTravel + DailyRate + DistanceFromHome + Education +
EnvironmentSatisfaction + Gender + HourlyRate + JobInvolvement +
JobSatisfaction + MaritalStatus + MonthlyRate + NumCompaniesWorked +
OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
StockOptionLevel + TrainingTimesLastYear + WorkLifeBalance +
YearsSinceLastPromotion + YearsWithCurrManager +
MonthlyIncome * JobLevel + TotalWorkingYears * JobLevel +
YearsInCurrentRole * YearsAtCompany,
data = balanceddata_interaction, family = binomial())
# Print out the results of the logistic regression model
summary(logistic_model_interaction)
##
## Call:
## glm(formula = Attrition_numeric ~ Department + EducationField +
## Age + BusinessTravel + DailyRate + DistanceFromHome + Education +
## EnvironmentSatisfaction + Gender + HourlyRate + JobInvolvement +
## JobSatisfaction + MaritalStatus + MonthlyRate + NumCompaniesWorked +
## OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
## StockOptionLevel + TrainingTimesLastYear + WorkLifeBalance +
## YearsSinceLastPromotion + YearsWithCurrManager + MonthlyIncome *
## JobLevel + TotalWorkingYears * JobLevel + YearsInCurrentRole *
## YearsAtCompany, family = binomial(), data = balanceddata_interaction)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4862341 0.6400046 -3.885 0.000102 ***
## DepartmentResearch & Development 0.0758041 0.1507507 0.503 0.615074
## DepartmentSales 0.5766002 0.1702553 3.387 0.000707 ***
## EducationFieldHuman Resources 0.7813142 0.7140475 1.094 0.273865
## EducationFieldLife Sciences 0.5862909 0.6036344 0.971 0.331415
## EducationFieldMarketing 0.3417749 0.6211637 0.550 0.582170
## EducationFieldMedical 0.5814716 0.6022051 0.966 0.334259
## EducationFieldOther 0.5755311 0.6188361 0.930 0.352360
## EducationFieldTechnical Degree 1.0251029 0.6150993 1.667 0.095601 .
## Age -0.4074443 0.0391830 -10.398 < 2e-16 ***
## BusinessTravelTravel_Frequently 1.4106405 0.1579973 8.928 < 2e-16 ***
## BusinessTravelTravel_Rarely 0.6633699 0.1426172 4.651 3.30e-06 ***
## DailyRate -0.1841439 0.0377842 -4.874 1.10e-06 ***
## DistanceFromHome 0.1252703 0.0415807 3.013 0.002589 **
## Education 0.0008085 0.0378953 0.021 0.982978
## EnvironmentSatisfaction -0.1984853 0.0415693 -4.775 1.80e-06 ***
## GenderMale 0.1278592 0.0779162 1.641 0.100801
## HourlyRate 0.1461968 0.0423125 3.455 0.000550 ***
## JobInvolvement -0.1804064 0.0361011 -4.997 5.82e-07 ***
## JobSatisfaction -0.1237468 0.0369841 -3.346 0.000820 ***
## MaritalStatusMarried 0.1937180 0.1055471 1.835 0.066451 .
## MaritalStatusSingle 0.5668393 0.1322220 4.287 1.81e-05 ***
## MonthlyRate -0.0635002 0.0382853 -1.659 0.097195 .
## NumCompaniesWorked 0.0053192 0.0397640 0.134 0.893584
## OverTimeYes 0.9370381 0.0807609 11.603 < 2e-16 ***
## PercentSalaryHike -0.1197368 0.0420129 -2.850 0.004372 **
## PerformanceRating 0.0826448 0.0429547 1.924 0.054355 .
## RelationshipSatisfaction -0.0625610 0.0373027 -1.677 0.093519 .
## StockOptionLevel -0.0927660 0.0483766 -1.918 0.055164 .
## TrainingTimesLastYear -0.1425336 0.0391973 -3.636 0.000277 ***
## WorkLifeBalance -0.0695363 0.0367658 -1.891 0.058580 .
## YearsSinceLastPromotion 0.1747860 0.0479935 3.642 0.000271 ***
## YearsWithCurrManager -0.0831406 0.0700081 -1.188 0.234996
## MonthlyIncome -0.1216715 0.1313998 -0.926 0.354465
## JobLevel -0.1515737 0.1206870 -1.256 0.209144
## TotalWorkingYears 0.1712470 0.0700922 2.443 0.014559 *
## YearsInCurrentRole -0.3204637 0.0672382 -4.766 1.88e-06 ***
## YearsAtCompany 0.0199024 0.0834474 0.239 0.811491
## MonthlyIncome:JobLevel 0.0420247 0.0589718 0.713 0.476079
## JobLevel:TotalWorkingYears -0.0247818 0.0609996 -0.406 0.684550
## YearsInCurrentRole:YearsAtCompany 0.0640496 0.0380298 1.684 0.092144 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4968.5 on 3583 degrees of freedom
## Residual deviance: 4229.8 on 3543 degrees of freedom
## AIC: 4311.8
##
## Number of Fisher Scoring iterations: 3
# Score the interaction model on its test split at a 0.5 cut-off
predictions <- predict(logistic_model_interaction, newdata = testset_interaction, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Evaluate model performance. Named conf_matrix_interaction rather than
# `confusionMatrix` because the original name shadowed the caret function
# confusionMatrix(), which this script calls again further below.
conf_matrix_interaction <- table(Predicted = predicted_classes, Actual = testset_interaction$Attrition_numeric)
accuracy <- sum(diag(conf_matrix_interaction)) / sum(conf_matrix_interaction)
# Print the confusion matrix
print(conf_matrix_interaction)
## Actual
## Predicted 0 1
## 0 502 236
## 1 266 532
# Overall hold-out accuracy of the interaction model
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.673177083333333"
OverTimeYes (Coefficient: 0.9370, p-value: < 2e-16) Most significant predictor of attrition. Employees working overtime are more likely to leave. Age (Coefficient: -0.4074, p-value: < 2e-16) Significant negative predictor. Older employees are less likely to leave. BusinessTravelTravel_Frequently (Coefficient: 1.4106, p-value: < 2e-16) Frequent business travelers are more likely to leave. DailyRate (Coefficient: -0.1841, p-value: 1.10e-06) Higher daily rates are associated with lower attrition. EnvironmentSatisfaction (Coefficient: -0.1985, p-value: 1.80e-06) Higher satisfaction leads to lower attrition. BusinessTravelTravel_Rarely (Coefficient: 0.6634, p-value: 3.30e-06) Less frequent travel still predicts higher attrition compared to no travel. YearsInCurrentRole (Coefficient: -0.3205, p-value: 1.88e-06) Longer time in current role associates with lower attrition. JobInvolvement (Coefficient: -0.1804, p-value: 5.82e-07) Higher job involvement is linked to lower attrition. JobSatisfaction (Coefficient: -0.1237, p-value: 0.000820) Higher job satisfaction reduces attrition. DistanceFromHome (Coefficient: 0.1253, p-value: 0.002589) Longer commutes increase the likelihood of leaving.
# Coerce predictions and observed labels to factors over a common 0/1 level
# set so caret::confusionMatrix() can align the two vectors.
predicted_factors <- factor(predicted_classes, levels = 0:1)
actual_factors <- factor(testset_interaction$Attrition_numeric, levels = 0:1)
# Full caret evaluation: confusion matrix plus overall and per-class statistics.
evaluation_results <- confusionMatrix(predicted_factors, actual_factors)
print(evaluation_results)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 502 236
## 1 266 532
##
## Accuracy : 0.6732
## 95% CI : (0.6491, 0.6966)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3464
##
## Mcnemar's Test P-Value : 0.1956
##
## Sensitivity : 0.6536
## Specificity : 0.6927
## Pos Pred Value : 0.6802
## Neg Pred Value : 0.6667
## Prevalence : 0.5000
## Detection Rate : 0.3268
## Detection Prevalence : 0.4805
## Balanced Accuracy : 0.6732
##
## 'Positive' Class : 0
##
library(ggplot2)
library(dplyr)
# Pull the fitted coefficient table (estimate, SE, z, p) out of the model summary.
model_summary <- summary(logistic_model_interaction)
coefficients <- model_summary$coefficients
# Reshape the coefficient matrix into a tidy data frame, one row per term.
data <- as.data.frame(coefficients)
data$Variable <- rownames(coefficients)
rownames(data) <- NULL
# Tag each term by the sign of its estimate.
data$Direction <- ifelse(data$Estimate > 0, "Positive", "Negative")
# Drop the intercept and split into sign-specific tables, ordered by effect size.
positive_coeffs <- data %>%
  filter(Variable != "(Intercept)", Estimate > 0) %>%
  arrange(desc(Estimate))
negative_coeffs <- data %>%
  filter(Variable != "(Intercept)", Estimate < 0) %>%
  arrange(Estimate)
# Bar colours keyed to coefficient direction.
positive_color <- "#EF553B"
negative_color <- "#00CC96"
# Apply a shared minimal look to a ggplot: larger text, no y-axis ticks and
# no horizontal grid lines, and the legend suppressed. Returns the styled plot.
clean_theme <- function(p) {
  styled <- p +
    theme_minimal() +
    theme(
      text = element_text(size = 16),
      axis.title.y = element_text(size = 18),
      axis.text.y = element_text(size = 12),
      axis.ticks.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      legend.position = "none"
    )
  styled
}
# Function to add coefficient labels inside the bars
# Overlays white numeric labels (two decimals) inside each bar of a flipped
# bar chart. The label's y-position is nudged toward the bar's base: positive
# bars shift down by 5% of the largest estimate, negative bars shift up by 5%
# of the largest magnitude, so text sits inside the bar rather than at its tip.
# NOTE(review): `hjust` is a vector passed outside aes(); this relies on the
# rows of `data` matching the plotted order — confirm if `data` is reordered.
add_coefficient_labels <- function(p, data) {
  p + geom_text(
    data = data,
    aes(label = sprintf("%.2f", Estimate), x = reorder(Variable, Estimate), y = ifelse(Direction == "Positive", Estimate - 0.05 * max(data$Estimate), Estimate + 0.05 * abs(min(data$Estimate)))),
    size = 3,
    hjust = ifelse(data$Direction == "Positive", 1.1, -0.1),
    color = "white"
  )
}
# Horizontal bar chart of the positive (attrition-increasing) coefficients.
p1 <- ggplot(positive_coeffs, aes(x = reorder(Variable, Estimate), y = Estimate, fill = Direction)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = positive_color) +
  labs(title = "Positive Coefficients for Attrition")
# Horizontal bar chart of the negative (attrition-decreasing) coefficients.
p2 <- ggplot(negative_coeffs, aes(x = reorder(Variable, -Estimate), y = Estimate, fill = Direction)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = negative_color) +
  labs(title = "Negative Coefficients for Attrition")
# Style both plots, then drop the numeric coefficient labels into the bars.
p1 <- add_coefficient_labels(clean_theme(p1), positive_coeffs)
p2 <- add_coefficient_labels(clean_theme(p2), negative_coeffs)
# Render both charts.
print(p1)
print(p2)
# Convert predicted_classes and actual values to factors to ensure compatibility with confusionMatrix function
# NOTE(review): this chunk is an exact repeat of the evaluation performed a
# few chunks above (same factors, same confusionMatrix call, same output);
# it recomputes identical results.
predicted_factors <- factor(predicted_classes, levels = c(0, 1))
actual_factors <- factor(testset_interaction$Attrition_numeric, levels = c(0, 1))
# Load the caret package if you haven't already
library(caret)
evaluation_results <- confusionMatrix(predicted_factors, actual_factors)
# Print the evaluation results which include confusion matrix, overall statistics, and class statistics
print(evaluation_results)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 502 236
## 1 266 532
##
## Accuracy : 0.6732
## 95% CI : (0.6491, 0.6966)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3464
##
## Mcnemar's Test P-Value : 0.1956
##
## Sensitivity : 0.6536
## Specificity : 0.6927
## Pos Pred Value : 0.6802
## Neg Pred Value : 0.6667
## Prevalence : 0.5000
## Detection Rate : 0.3268
## Detection Prevalence : 0.4805
## Balanced Accuracy : 0.6732
##
## 'Positive' Class : 0
##
# Building the decision tree model
# Fits a classification tree on the class-balanced training set, then scores
# the (unbalanced) reduced test set.
# NOTE(review): rpart is not attached in the library() block visible at the
# top of this file — presumably loaded elsewhere in the document; verify.
decision_tree_model <- rpart(Attrition_numeric ~ .,
data = trainingdata_balanced_ab,
method = "class")
# Predicting using the decision tree model
tree_predictions <- predict(decision_tree_model, newdata = testingdata_reduced_ab, type = "class")
# Convert predictions to factor to ensure compatibility with confusionMatrix function
# It's essential that the levels match those in the actual outcome variable
tree_predictions_factor <- factor(tree_predictions, levels = levels(testingdata_reduced_ab$Attrition_numeric))
# Evaluation using confusion matrix
tree_evaluation_results <- confusionMatrix(tree_predictions_factor, testingdata_reduced_ab$Attrition_numeric)
# Print the evaluation results
print(tree_evaluation_results)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4120 390
## 1 1729 707
##
## Accuracy : 0.6949
## 95% CI : (0.684, 0.7057)
## No Information Rate : 0.8421
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2332
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7044
## Specificity : 0.6445
## Pos Pred Value : 0.9135
## Neg Pred Value : 0.2902
## Prevalence : 0.8421
## Detection Rate : 0.5931
## Detection Prevalence : 0.6493
## Balanced Accuracy : 0.6744
##
## 'Positive' Class : 0
##
# Set up cross-validation settings
# NOTE(review): this 5-fold control object is dead code — it is overwritten
# by a 10-fold trainControl() a few lines below before any model is trained.
train_control <- trainControl(
method = "cv", # Use k-fold cross-validation
number = 5, # Number of folds in k-fold cross-validation
savePredictions = "final",
classProbs = TRUE # Store class probabilities (useful for ROC, AUC)
)
# caret::train() requires outcome levels that are syntactically valid R names
# when classProbs = TRUE, so the "0"/"1" labels must become "X0"/"X1".
levels(trainingdata_balanced_ab$Attrition_numeric)
## [1] "0" "1"
lvls <- levels(trainingdata_balanced_ab$Attrition_numeric)
levels(trainingdata_balanced_ab$Attrition_numeric) <- make.names(lvls)
# Confirm the relabelling took effect.
levels(trainingdata_balanced_ab$Attrition_numeric)
## [1] "X0" "X1"
# 10-fold cross-validation, keeping final-fold predictions and class
# probabilities for the later ROC/AUC comparison.
train_control <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = "final",
  classProbs = TRUE
)
# Fit a CART tree under 10-fold cross-validation; caret tunes cp by accuracy.
set.seed(123) # reproducible fold assignment
decision_tree_model_cv <- train(
  Attrition_numeric ~ .,
  data = trainingdata_balanced_ab,
  method = "rpart",
  metric = "Accuracy",
  trControl = train_control
)
# Show the resampling summary and the cp value that was selected.
print(decision_tree_model_cv)
## CART
##
## 27300 samples
## 22 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 24570, 24570, 24570, 24570, 24570, 24570, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01978022 0.6584982 0.3169963
## 0.02849817 0.6279121 0.2558242
## 0.23626374 0.5683883 0.1367766
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01978022.
# Build a confusion matrix from the held-out fold predictions that caret
# saved during cross-validation (savePredictions = "final").
cv_preds <- decision_tree_model_cv$pred
conf_matrix <- confusionMatrix(
  data = factor(cv_preds$pred, levels = c("X0", "X1")),
  reference = factor(cv_preds$obs, levels = c("X0", "X1"))
)
# Report the cross-validated performance.
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 9504 5177
## X1 4146 8473
##
## Accuracy : 0.6585
## 95% CI : (0.6528, 0.6641)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.317
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6963
## Specificity : 0.6207
## Pos Pred Value : 0.6474
## Neg Pred Value : 0.6714
## Prevalence : 0.5000
## Detection Rate : 0.3481
## Detection Prevalence : 0.5378
## Balanced Accuracy : 0.6585
##
## 'Positive' Class : X0
##
# Visualise how cross-validated accuracy varies with the complexity parameter.
cv_results <- decision_tree_model_cv$results
ggplot(cv_results, aes(x = cp, y = Accuracy)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Accuracy vs Complexity Parameter (cp)",
    x = "Complexity Parameter (cp)",
    y = "Accuracy"
  ) +
  theme_minimal()
# Refit the tree directly with rpart at the CV-selected cp so it can be drawn.
decision_tree <- rpart(
  Attrition_numeric ~ .,
  data = trainingdata_balanced_ab,
  method = "class",
  control = rpart.control(cp = 0.01978022)
)
# Draw the fitted tree: splits, per-node class distribution, and counts
# displayed under each node; factor labels auto-sized.
rpart.plot(decision_tree,
  main = "Decision Tree Plot",
  extra = 104,
  under = TRUE,
  faclen = 0
)
# Importance scores accumulated by rpart across the tree's splits.
# NOTE(review): this reads `decision_tree_model` (the first, un-tuned rpart
# fit), not `decision_tree` (the CV-tuned refit just above) — confirm which
# model's importance was intended; the printed values match whichever was run.
variable_importance <- decision_tree_model$variable.importance
# Order variables by importance in descending order
ordered_importance <- sort(variable_importance, decreasing = TRUE)
# Print the ordered variable importance
ordered_importance
## Age OverTime StockOptionLevel
## 785.6839521 533.4903002 369.9393083
## MaritalStatus MonthlyIncome BusinessTravel
## 276.8573304 206.9696411 109.6644593
## Education YearsAtCompany PercentSalaryHike
## 65.3445040 64.4883620 18.5971663
## TrainingTimesLastYear NumCompaniesWorked EnvironmentSatisfaction
## 16.7410552 15.4888790 13.1240286
## EducationField YearsSinceLastPromotion
## 4.1421277 0.3944884
# Provided variable importance scores (hand-copied from the model output above).
variable_importance <- c(
  Age = 785.6839521, OverTime = 533.4903002, StockOptionLevel = 369.9393083,
  MaritalStatus = 276.8573304, BusinessTravel = 109.6644593, Education = 65.3445040,
  YearsAtCompany = 64.4883620, PercentSalaryHike = 18.5971663, TrainingTimesLastYear = 16.7410552,
  NumCompaniesWorked = 15.4888790, EnvironmentSatisfaction = 13.1240286, EducationField = 4.1421277,
  YearsSinceLastPromotion = 0.3944884
)
# Order from most to least important and echo the result.
sorted_importance <- sort(variable_importance, decreasing = TRUE)
print(sorted_importance)
## Age OverTime StockOptionLevel
## 785.6839521 533.4903002 369.9393083
## MaritalStatus BusinessTravel Education
## 276.8573304 109.6644593 65.3445040
## YearsAtCompany PercentSalaryHike TrainingTimesLastYear
## 64.4883620 18.5971663 16.7410552
## NumCompaniesWorked EnvironmentSatisfaction EducationField
## 15.4888790 13.1240286 4.1421277
## YearsSinceLastPromotion
## 0.3944884
# Assemble a plotting frame and draw a horizontal, gradient-filled bar chart.
importance_df <- data.frame(
  Variable = names(sorted_importance),
  Importance = sorted_importance
)
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance, fill = Importance)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  coord_flip() +
  labs(title = "Variable Importance from Model", x = "Variable", y = "Importance") +
  theme_minimal() +
  theme(legend.position = "none")
# Full decision tree with all predictors on the (unbalanced) training set.
cp_value <- 0.01 # complexity parameter; lower grows a deeper tree
tree_model <- rpart(Attrition_numeric ~ ., data = training_set, method = "class", cp = cp_value)
# Draw the tree with a diverging palette, drop shadows, and enhanced split labels.
rpart.plot(tree_model,
  main = "Decision Tree for Attrition Prediction",
  box.palette = "RdYlGn",
  shadow.col = "gray",
  cex = 0.65,
  tweak = 1,
  fallen.leaves = FALSE,
  type = 4,
  extra = 104
)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Score the held-out set and summarise performance.
predictions <- predict(tree_model, newdata = testing_set, type = "class")
confusionMatrix(predictions, testing_set$Attrition_numeric)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4015 365
## 1 1834 732
##
## Accuracy : 0.6834
## 95% CI : (0.6723, 0.6943)
## No Information Rate : 0.8421
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2291
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6864
## Specificity : 0.6673
## Pos Pred Value : 0.9167
## Neg Pred Value : 0.2853
## Prevalence : 0.8421
## Detection Rate : 0.5780
## Detection Prevalence : 0.6306
## Balanced Accuracy : 0.6769
##
## 'Positive' Class : 0
##
# Extract variable importance from the rpart model into a one-column frame.
var_importance <- as.data.frame(tree_model$variable.importance)
names(var_importance) <- c("Importance")
# Sort the variables by importance (descending), keeping the data-frame shape.
var_importance_sorted <- var_importance[order(-var_importance$Importance), , drop = FALSE]
# Base-graphics barplot of the sorted importances.
barplot(var_importance_sorted$Importance, main = "Variable Importance",
xlab = "Variables", ylab = "Importance", las = 2, cex.names = 0.7)
# Re-draw with a wider bottom margin so variable names fit.
# NOTE(review): par() is changed without being restored; later base plots in
# this session inherit the enlarged margin.
par(mar = c(12, 5, 4, 2) + 0.1) # Adjust the bottom margin to fit variable names
barplot(var_importance_sorted$Importance, main = "Variable Importance",
xlab = "Variables", ylab = "Importance", las = 2, cex.names = 0.7)
# Rebuild as a two-column frame for ggplot.
var_importance_sorted <- data.frame(
Variable = rownames(var_importance_sorted),
Importance = var_importance_sorted$Importance
)
# Horizontal bar chart of the importances.
# FIX: the original labs() swapped the axis titles — `x = "Importance"`
# labelled the variable axis (which needs no title and is blanked by the
# theme anyway) and left the value axis untitled. After coord_flip() the
# value axis is the `y` aesthetic, so "Importance" belongs on `y`.
ggplot(var_importance_sorted, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip coordinates to make bars horizontal
labs(title = "Variable Importance",
x = NULL,
y = "Importance") +
theme_minimal() +
theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 8), # Adjust the text size as needed
axis.title.y = element_blank(), # Remove the variable-axis title
plot.title = element_text(hjust = 0.5)) # Center the plot title
# Per-department subsets for separate attrition models.
sales_data <- subset(ab_dataset_reduced, Department == "Sales")
hr_data <- subset(ab_dataset_reduced, Department == "Human Resources")
rnd_data <- subset(ab_dataset_reduced, Department == "Research & Development")
# Split one department's data 70/30, downsample the majority class in the
# training half, fit a classification tree, and evaluate on the untouched
# test half. Returns list(model = rpart fit, evaluation = caret confusion matrix).
process_department_data <- function(data) {
  set.seed(123) # reproducible partition
  split_idx <- createDataPartition(y = data$Attrition_numeric, p = 0.7, list = FALSE)
  train_part <- data[split_idx, ]
  test_part <- data[-split_idx, ]
  # downSample() returns the outcome in a column named `Class`; move it back
  # under the expected name and re-assert the 0/1 factor coding.
  outcome_col <- which(names(train_part) == "Attrition_numeric")
  balanced <- downSample(x = train_part[, -outcome_col],
                         y = train_part$Attrition_numeric)
  balanced$Attrition_numeric <- factor(balanced$Class, levels = c(0, 1))
  balanced$Class <- NULL
  # Fit the department-specific tree and score the held-out rows.
  fit <- rpart(Attrition_numeric ~ ., data = balanced, method = "class")
  preds <- predict(fit, newdata = test_part, type = "class")
  list(
    model = fit,
    evaluation = confusionMatrix(preds, test_part$Attrition_numeric)
  )
}
# Process each department dataset.
# FIX(review): the original ran these three calls twice back to back; the
# second pass recomputed identical results (set.seed() inside the helper makes
# the split deterministic), so the duplicate block was removed.
sales_results <- process_department_data(sales_data)
hr_results <- process_department_data(hr_data)
rnd_results <- process_department_data(rnd_data)
# Held-out performance for each department-specific tree.
# Print the confusion matrix for the Sales department
print("Confusion Matrix for Sales Department:")
## [1] "Confusion Matrix for Sales Department:"
print(sales_results$evaluation)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1029 116
## 1 301 220
##
## Accuracy : 0.7497
## 95% CI : (0.7282, 0.7703)
## No Information Rate : 0.7983
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3553
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7737
## Specificity : 0.6548
## Pos Pred Value : 0.8987
## Neg Pred Value : 0.4223
## Prevalence : 0.7983
## Detection Rate : 0.6176
## Detection Prevalence : 0.6873
## Balanced Accuracy : 0.7142
##
## 'Positive' Class : 0
##
# Print the confusion matrix for the HR department
print("Confusion Matrix for HR Department:")
## [1] "Confusion Matrix for HR Department:"
print(hr_results$evaluation)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 421 4
## 1 143 90
##
## Accuracy : 0.7766
## 95% CI : (0.7428, 0.8079)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4355
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7465
## Specificity : 0.9574
## Pos Pred Value : 0.9906
## Neg Pred Value : 0.3863
## Prevalence : 0.8571
## Detection Rate : 0.6398
## Detection Prevalence : 0.6459
## Balanced Accuracy : 0.8520
##
## 'Positive' Class : 0
##
# Print the confusion matrix for the R&D department
print("Confusion Matrix for R&D Department:")
## [1] "Confusion Matrix for R&D Department:"
print(rnd_results$evaluation)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2876 220
## 1 1078 446
##
## Accuracy : 0.719
## 95% CI : (0.7058, 0.732)
## No Information Rate : 0.8558
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2585
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7274
## Specificity : 0.6697
## Pos Pred Value : 0.9289
## Neg Pred Value : 0.2927
## Prevalence : 0.8558
## Detection Rate : 0.6225
## Detection Prevalence : 0.6701
## Balanced Accuracy : 0.6985
##
## 'Positive' Class : 0
##
# Calculate variable importance for each department's model
# caret::varImp() on an rpart fit returns one "Overall" score per predictor;
# zeros indicate variables never used in a split or surrogate.
sales_var_imp <- varImp(sales_results$model, scale = FALSE)
hr_var_imp <- varImp(hr_results$model, scale = FALSE)
rnd_var_imp <- varImp(rnd_results$model, scale = FALSE)
# Print the variable importance for Sales department
print("Variable Importance for Sales Department:")
## [1] "Variable Importance for Sales Department:"
print(sales_var_imp)
## Overall
## Age 158.917796
## BusinessTravel 107.007484
## DistanceFromHome 83.735931
## Education 26.841194
## EnvironmentSatisfaction 25.082053
## JobInvolvement 65.167166
## JobSatisfaction 44.118585
## MaritalStatus 52.295666
## MonthlyIncome 95.513794
## NumCompaniesWorked 24.978251
## OverTime 36.146428
## PercentSalaryHike 35.233779
## PerformanceRating 2.892857
## StockOptionLevel 15.059482
## TrainingTimesLastYear 30.083053
## YearsAtCompany 59.115339
## YearsSinceLastPromotion 16.635839
## Department 0.000000
## EducationField 0.000000
## Gender 0.000000
## RelationshipSatisfaction 0.000000
## WorkLifeBalance 0.000000
# Print the variable importance for HR department
print("Variable Importance for HR Department:")
## [1] "Variable Importance for HR Department:"
print(hr_var_imp)
## Overall
## Age 62.367579
## BusinessTravel 22.913872
## DistanceFromHome 69.957259
## Education 16.652117
## EducationField 42.574613
## EnvironmentSatisfaction 63.398729
## JobInvolvement 9.319177
## JobSatisfaction 12.407505
## MaritalStatus 3.758586
## MonthlyIncome 27.340278
## NumCompaniesWorked 2.403565
## PercentSalaryHike 22.899110
## PerformanceRating 3.953970
## RelationshipSatisfaction 15.531891
## StockOptionLevel 29.644549
## TrainingTimesLastYear 43.370503
## YearsAtCompany 29.429738
## YearsSinceLastPromotion 3.508567
## Department 0.000000
## Gender 0.000000
## OverTime 0.000000
## WorkLifeBalance 0.000000
# Print the variable importance for R&D department
print("Variable Importance for R&D Department:")
## [1] "Variable Importance for R&D Department:"
print(rnd_var_imp)
## Overall
## Age 160.693968
## BusinessTravel 69.933187
## DistanceFromHome 6.121094
## Education 7.423855
## JobInvolvement 52.371749
## JobSatisfaction 34.568477
## MaritalStatus 95.858599
## MonthlyIncome 128.497173
## NumCompaniesWorked 8.517866
## OverTime 185.272132
## PercentSalaryHike 19.806168
## StockOptionLevel 107.883099
## TrainingTimesLastYear 45.172356
## YearsAtCompany 26.572917
## YearsSinceLastPromotion 10.655467
## Department 0.000000
## EducationField 0.000000
## EnvironmentSatisfaction 0.000000
## Gender 0.000000
## PerformanceRating 0.000000
## RelationshipSatisfaction 0.000000
## WorkLifeBalance 0.000000
# Plot variable importance for Sales department
plot(sales_var_imp, main="Variable Importance - Sales Department")
# Plot variable importance for HR department
plot(hr_var_imp, main="Variable Importance - HR Department")
# Plot variable importance for R&D department
plot(rnd_var_imp, main="Variable Importance - R&D Department")
# Hand-coded variable importance scores for the Sales department.
# FIX(review): the original stored this table in `sales_data`, clobbering the
# Sales-department subset created earlier; renamed so the subset survives.
sales_importance <- data.frame(
  Variable = c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
               "Education", "EnvironmentSatisfaction", "HourlyRate",
               "JobInvolvement", "JobLevel", "JobRole"),
  Importance = c(151.418909, 116.449477, 64.651043, 93.528052,
                 7.479869, 15.834588, 61.812332, 62.588595,
                 7.500696, 27.948940)
)
# Bar chart of the scores, most important first.
ggplot(sales_importance, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  labs(title = "Variable Importance for Sales Department",
       x = "Variable",
       y = "Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Hand-coded variable importance scores for the HR department.
# FIX(review): the original stored this table in `hr_data`, clobbering the
# HR-department subset created earlier; renamed so the subset survives.
hr_importance <- data.frame(
  Variable = c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
               "Education", "EducationField", "EnvironmentSatisfaction",
               "HourlyRate", "JobInvolvement", "JobRole"),
  Importance = c(48.130178, 22.913872, 31.306332, 60.253362,
                 10.869100, 46.443127, 61.366961, 26.498422,
                 5.205372, 2.909091)
)
# Bar chart of the scores, most important first.
ggplot(hr_importance, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "darkgreen") +
  labs(title = "Variable Importance for HR Department",
       x = "Variable",
       y = "Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Hand-coded variable importance scores for the R&D department.
# FIX(review): the original stored this table in `rnd_data`, clobbering the
# R&D-department subset created earlier; renamed so the subset survives.
rnd_importance <- data.frame(
  Variable = c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
               "Education", "JobInvolvement", "JobSatisfaction",
               "MaritalStatus", "MonthlyIncome", "MonthlyRate"),
  Importance = c(133.977348, 69.933187, 104.876065, 6.121094,
                 16.026592, 61.172651, 56.868178, 95.858599,
                 128.497173, 49.547281)
)
# Bar chart of the scores, most important first.
ggplot(rnd_importance, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "firebrick") +
  labs(title = "Variable Importance for R&D Department",
       x = "Variable",
       y = "Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert predictions and actual data to factors with consistent levels
# NOTE(review): this chunk and the next repeat the logistic and CV-tree
# evaluations already printed above, here for side-by-side comparison; the
# results are identical to the earlier printouts.
predicted_classes_factor <- factor(predicted_classes, levels = c(0, 1))
actual_classes_factor <- factor(testset_interaction$Attrition_numeric, levels = c(0, 1))
# Calculate and print the confusion matrix and related statistics for logistic regression
conf_matrix_logistic <- confusionMatrix(data = predicted_classes_factor, reference = actual_classes_factor)
print(conf_matrix_logistic)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 502 236
## 1 266 532
##
## Accuracy : 0.6732
## 95% CI : (0.6491, 0.6966)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3464
##
## Mcnemar's Test P-Value : 0.1956
##
## Sensitivity : 0.6536
## Specificity : 0.6927
## Pos Pred Value : 0.6802
## Neg Pred Value : 0.6667
## Prevalence : 0.5000
## Detection Rate : 0.3268
## Detection Prevalence : 0.4805
## Balanced Accuracy : 0.6732
##
## 'Positive' Class : 0
##
# Cross-validated tree predictions saved by caret during training.
tree_predictions <- decision_tree_model_cv$pred
predicted_classes_tree <- factor(tree_predictions$pred, levels = c("X0", "X1"))
actual_classes_tree <- factor(tree_predictions$obs, levels = c("X0", "X1"))
# Calculate and print the confusion matrix and related statistics for decision tree
conf_matrix_tree <- confusionMatrix(data = predicted_classes_tree, reference = actual_classes_tree)
print(conf_matrix_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 9504 5177
## X1 4146 8473
##
## Accuracy : 0.6585
## 95% CI : (0.6528, 0.6641)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.317
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6963
## Specificity : 0.6207
## Pos Pred Value : 0.6474
## Neg Pred Value : 0.6714
## Prevalence : 0.5000
## Detection Rate : 0.3481
## Detection Prevalence : 0.5378
## Balanced Accuracy : 0.6585
##
## 'Positive' Class : X0
##
# Recompute both confusion matrices so the two models can be compared directly.
tree_cv_preds <- decision_tree_model_cv$pred
conf_matrix_logistic <- confusionMatrix(
  as.factor(predicted_classes),
  as.factor(testset_interaction$Attrition_numeric)
)
conf_matrix_tree <- confusionMatrix(
  factor(tree_cv_preds$pred, levels = c("X0", "X1")),
  factor(tree_cv_preds$obs, levels = c("X0", "X1"))
)
# Pull a one-row data frame of headline metrics out of a caret
# confusionMatrix object (reads its $overall and $byClass vectors).
extract_metrics <- function(cm) {
  overall <- cm$overall
  by_class <- cm$byClass
  data.frame(
    Accuracy = overall["Accuracy"],
    Kappa = overall["Kappa"],
    Precision = by_class["Pos Pred Value"],
    Recall = by_class["Sensitivity"],
    F1 = by_class["F1"]
  )
}
# One row of metrics per model, stacked and melted to long form for plotting.
metrics_logistic <- extract_metrics(conf_matrix_logistic)
metrics_tree <- extract_metrics(conf_matrix_tree)
metrics_df <- rbind(metrics_logistic, metrics_tree)
metrics_df$Model <- c("Logistic Regression", "Decision Tree")
metrics_df <- reshape2::melt(metrics_df, id.vars = "Model")
# Grouped bar chart with rounded value labels above the bars.
ggplot(metrics_df, aes(x = variable, y = value, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  geom_text(aes(label = round(value, 2)),
            position = position_dodge(width = 0.8),
            vjust = -0.25,
            size = 3.5) +
  labs(title = "Comparison of Model Metrics", x = "Metric", y = "Value") +
  theme_minimal(base_size = 11.5) +
  scale_fill_brewer(palette = "Set1") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 14)
  )
library(pROC)
library(ggplot2)
library(plotly)
# Draw an interactive ROC curve for one model and report its AUC.
#
# predicted_probs: numeric vector of predicted probabilities for the positive class
# actuals:         observed outcomes (anything pROC::roc() accepts as a response)
# model_name:      label used in the plot title and to pick the line colour
# Returns list(plot = plotly ROC curve, auc = "AUC = x.xx" string).
plot_roc_auc <- function(predicted_probs, actuals, model_name) {
  roc_obj <- roc(actuals, predicted_probs)
  auc_value <- auc(roc_obj)
  # FIX: the x axis is the false positive rate, i.e. 1 - specificity. The
  # original assigned roc_obj$specificities directly to FPR, which mirrored
  # the curve horizontally and contradicted the axis label below.
  roc_data <- data.frame(
    FPR = 1 - roc_obj$specificities,
    TPR = roc_obj$sensitivities
  )
  # Plot ROC curve using ggplot2
  roc_plot <- ggplot(roc_data, aes(x = FPR, y = TPR)) +
    geom_line(color = ifelse(model_name == "Logistic Regression", "blue", "green"), size = 1.5) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    ggtitle(paste("ROC Curve -", model_name)) +
    xlab("False Positive Rate (1 - Specificity)") +
    ylab("True Positive Rate (Sensitivity)") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  # Interactive version; legend tucked toward the lower-right corner.
  roc_plotly <- ggplotly(roc_plot) %>% layout(legend = list(x = 0.8, y = 0.2))
  auc_text <- paste("AUC =", round(auc_value, 2))
  return(list(plot = roc_plotly, auc = auc_text))
}
# Calculate predicted probabilities for logistic regression model (using "logistic_model_interaction")
predicted_probs_logistic <- predict(logistic_model_interaction, newdata = testset_interaction, type = "response")
# Plot ROC curve and calculate AUC for logistic regression model
roc_logistic <- plot_roc_auc(predicted_probs_logistic, testset_interaction$Attrition_numeric, "Logistic Regression")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate predicted probabilities for decision tree model
# NOTE(review): the CV tree was trained on trainingdata_balanced_ab but is
# scored here on testset_interaction — confirm both frames share the same
# predictor columns; caret would error otherwise.
predicted_probs_tree <- predict(decision_tree_model_cv, newdata = testset_interaction, type = "prob")[, "X1"]
# Plot ROC curve and calculate AUC for decision tree model
roc_tree <- plot_roc_auc(predicted_probs_tree, testset_interaction$Attrition_numeric, "Decision Tree")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_logistic$auc # Print AUC for logistic regression model
## [1] "AUC = 0.73"
roc_tree$auc # Print AUC for decision tree model
## [1] "AUC = 0.65"
roc_logistic$plot # Display ROC plot for logistic regression model
roc_tree$plot # Display ROC plot for decision tree model